New upstream version 0.22.0+ds

author Andreas Tille <tille@debian.org>

Wed, 8 Nov 2023 08:35:50 +0000 (09:35 +0100)

committer Andreas Tille <tille@debian.org>

Wed, 8 Nov 2023 08:35:50 +0000 (09:35 +0100)
author Andreas Tille <tille@debian.org>
Wed, 8 Nov 2023 08:35:50 +0000 (09:35 +0100)
committer Andreas Tille <tille@debian.org>
Wed, 8 Nov 2023 08:35:50 +0000 (09:35 +0100)
diff --git a/.cirrus.yml b/.cirrus.yml

new file mode 100644 (file)

index 0000000..edf235c
--- /dev/null
+++ b/.cirrus.yml
@@ -0,0 +1,73 @@
+build_wheels_task:
+  only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*"
+
+  matrix:
+    - compute_engine_instance:
+        image_project: cirrus-images
+        image: family/docker-builder-arm64
+        architecture: arm64
+        platform: linux
+      matrix:
+        - name: Build ARM Linux py3.6-9 wheels
+          env:
+            CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*"
+        - name: Build ARM Linux py3.10-12 wheels
+          env:
+            CIBW_BUILD: "cp310-* cp311-* cp312-*"
+
+    - name: Build ARM macOS wheels
+      macos_instance:
+        image: ghcr.io/cirruslabs/macos-ventura-base:latest
+      env:
+        CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*"
+
+  alias: build_wheels
+
+  env:
+    CIRRUS_CLONE_DEPTH: 1
+
+    CIBW_SKIP: "*-musllinux_*"
+    CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28
+
+  install_script: |
+    python3 -m pip install cibuildwheel==2.16.2
+
+  build_script: |
+    cibuildwheel
+
+  wheels_artifacts:
+    path: wheelhouse/*.whl
+
+upload_pypi_task:
+  only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*"
+  depends_on: build_wheels
+
+  name: Publish ARM wheels
+
+  container:
+    image: python:latest
+
+  env:
+    CIRRUS_CLONE_DEPTH: 1
+    API_BASEURL: https://api.cirrus-ci.com/v1
+    TWINE_USERNAME: __token__
+
+  install_script: |
+    python3 -m pip install twine
+
+  get_artifacts_script: |
+    curl -sSLO $API_BASEURL/artifact/build/$CIRRUS_BUILD_ID/wheels.zip
+    unzip -q wheels.zip
+
+  upload_script: |
+    case "$CIRRUS_TAG" in
+    v0.*)
+        export TWINE_REPOSITORY=pypi TWINE_PASSWORD=$PYPI_TOKEN ;;
+    *)
+        export TWINE_REPOSITORY=testpypi TWINE_PASSWORD=$TESTPYPI_TOKEN ;;
+    esac
+
+    echo Uploading wheels to $TWINE_REPOSITORY...
+
+    python3 -m twine check wheelhouse/*.whl
+    python3 -m twine upload --disable-progress-bar wheelhouse/*.whl
diff --git a/.python-version b/.python-version

deleted file mode 100644 (file)

index d8c6f97..0000000
--- a/.python-version
+++ /dev/null
@@ -1,2 +0,0 @@
-3.6
-3.11
diff --git a/.travis.disabled.yml b/.travis.disabled.yml

deleted file mode 100644 (file)

index 5b7bcc8..0000000
--- a/.travis.disabled.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-os:
-  - linux
-  - osx
-
-language: c
-
-stages:
-  - test
-  - name: deploy
-    if: tag IS present
-
-env:
-  matrix:
-    - CONDA_PY=2.7
-    - CONDA_PY=3.6
-    - CONDA_PY=3.7
-    - CONDA_PY=3.8
-  global:
-    - PYSAM_LINKING_TEST=1
-    - TWINE_USERNAME=grepall
-    - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
-
-_cibw_common: &cibw_common
-  addons: {}
-  install:
-    - python3 -m pip install cibuildwheel>=1.1.0 twine
-  script:
-    - set -e
-    - cibuildwheel --output-dir dist
-    - twine check dist/*
-    - twine upload --skip-existing dist/*
-
-_cibw_linux: &cibw_linux
-  stage: deploy
-  os: linux
-  language: python
-  python: '3.5'
-  services:
-    - docker
-  <<: *cibw_common
-
-_cibw_linux_aarch64: &cibw_linux_aarch64
-  stage: deploy
-  os: linux
-  arch: arm64
-  language: python
-  python: '3.9'
-  services:
-    - docker
-  <<: *cibw_common
-
-matrix:
-  include:
-    - stage: deploy
-      os: linux
-      language: python
-      python: '3.5'
-      addons:
-        apt:
-          packages:
-            - gcc
-            - g++
-            - libcurl4-openssl-dev  # for libcurl support in sdist
-            - libssl-dev  # for s3 support in sdist
-      install:
-        - python3 -m pip install Cython twine
-      script:
-        - set -e
-        - python3 setup.py build_ext --inplace
-        - python3 setup.py sdist
-        - twine check dist/*
-        - twine upload --skip-existing dist/*
-    - <<: *cibw_linux
-      env:
-        - CIBW_BUILD="*_x86_64"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - <<: *cibw_linux
-      env:
-        - CIBW_BUILD="*_i686"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - <<: *cibw_linux_aarch64
-      env:
-        - CIBW_BUILD="*_aarch64"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - stage: deploy
-      os: osx
-      language: generic
-      env:
-        - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-      <<: *cibw_common
-
-addons:
-  apt:
-    packages:
-    - gcc
-    - g++
-
-script:
-  - ./devtools/run_tests_travis.sh
-
-notifications:
-  email:
-    - andreas.heger@gmail.com
diff --git a/MANIFEST.in b/MANIFEST.in

index 25e9a1a2c844d920ea0b47d97d6472891c88f7fc..5711f090280ffb08f0e12515c41d8b78c22c2e16 100644 (file)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -9,8 +9,7 @@ include NEWS
  include INSTALL
  include KNOWN_BUGS
  include THANKS
-include cy_build.py
-include requirements.txt
+include requirements-dev.txt
  include pysam/libc*.pxd
  include pysam/libc*.pyx
  include pysam/libc*.c
@@ -46,9 +45,6 @@ include htslib/configure htslib/version.sh
  include htslib/Makefile htslib/*.mk
  exclude htslib/config.mk htslib/htscodecs.mk
  
-include cy_build.py
-include requirements.txt
-
  # documentation
  include doc/*.py doc/*.rst
-include doc/Makefile doc/make.bat
+include doc/Makefile doc/make.bat doc/requirements-rtd.txt
diff --git a/NEWS b/NEWS

index e0b77a95d06114adf204e5e6db74aeee6a0c30a1..ad7cfb15a415c6a04de7e2c8043d952d602b4097 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,17 +1,96 @@
-An online version of the installation instructions can be found here:
-http://pysam.readthedocs.io/en/latest/release.html
+.. An online version of the release history can be found here:
+.. http://pysam.readthedocs.io/en/latest/release.html
+
+Release 0.22.0
+==============
+
+.. rubric:: 5 October 2023
+
+This pysam release wraps htslib/samtools/bcftools 1.18 (PR #1208).
+
+It has been tested with Python versions 3.6 through 3.12, and wheels are
+available via pypi_ for all of those Python versions. Python versions 3.6
+and 3.7 are end-of-life; particularly if you use pysam with either of
+these versions, please vote in the version survey at issue #1230.
+
+The final pysam release that supported Python 2.7 was v0.20.0.
+
+Bugs fixed:
+
+* Remove Cython from runtime dependencies (PR #1186, thanks to Nicola Soranzo,
+  also reported by Arya Massarat in PR #1194)
+
+* Miscellaneous dependency improvements (PR #1216, #1217, PR #1218, PR #1219,
+  thanks to Martin Larralde and Arthur Vigil)
+
+* Suppress spurious "Could not retrieve index file" message when opening an
+  AlignmentFile (#939, #1214, reported by ChengYong Tham and Sebastian Röner)
+
+* Propagate SAM parsing errors encountered in :meth:`.AlignedSegment.fromstring`
+  (#1196, reported by DV Klopfenstein)
+
+* Accept invalid MD:A tagged fields produced by HTSeq instead of crashing
+  in :meth:`AlignedSegment.get_aligned_pairs(with_seq=True)
+  <.AlignedSegment.get_aligned_pairs>` (#1226, reported by Isaac Vock)
+
+* Fix multiarch macOS CI builds by removing brewed liblzma (#1205, reported
+  by Till Hartmann)
+
+* Fix :attr:`.VariantRecordSample.alleles` type hint (#1179, reported by
+  David Seifert)
+
+New functionality:
+
+* Add optional :meth:`HTSFile.seek(..., whence) <.HTSFile.seek>` parameter
+  and clarify which functions use libc.SEEK_SET vs io.SEEK_SET
+  (#1185, requested by luyulin)
+
+* File handling improvements in samtools & bcftools commands (should improve
+  #1193 and #1195, reported by Rob Bierman and Sam Chorlton)
+
+* Improve :class:`.FastxFile` performance (PR #1227, thanks to Fabian Klötzl
+  and Valentyn Bezshapkin)
+
+* Improve the accuracy of type hints for :class:`.AlignmentFile` iteration
+  (#1184, PR #1189, reported by @PikalaxALT)
+
+Documentation improvements:
+
+* Clarify that :meth:`.AlignedSegment.get_aligned_pairs` results are 0-based
+  (#1180, reported by Nick Semenkovich)
+
+* Clarify :meth:`.AlignedSegment.get_reference_positions` documentation
+  (#836, #838, reported by Liang Ou and Nick Stoler)
+
+* Clarify that installation via pip usually uses a wheel, and that configuring
+  the build via $HTSLIB_CONFIGURE_OPTIONS etc only applies when installing from
+  an sdist (#1086, reported by Layne Sadler)
+
+A message from pysam's founder, Andreas Heger:
+
+    As many of you will have noticed, John Marshall has been effectively
+    maintaining pysam and supporting users over the last few years.
+    I, Andreas, am very grateful for the countless hours he has contributed.
+    Unfortunately, I will not be able to contribute much in the near and
+    intermediate future. To keep pysam going, John has kindly agreed to
+    continue maintaining and supporting pysam as the principal developer
+    of pysam. I am very happy to know that pysam is in good hands and want
+    to thank again John and the wider pysam community for their suggestions,
+    bug reports, code contributions and general support.
+
+Thank you Andreas for all your work over the years and the solid foundations
+that pysam enjoys and the useful functionality it provides.
  
-=============
-Release notes
-=============
  
  Release 0.21.0
  ==============
  
+.. rubric:: 2 April 2023
+
  This release wraps htslib/samtools/bcftools version 1.17.
  
-Pysam is now compatible with Python 3.11. We have removed python 2.x
-support. Pysam is tested with python versions 3.6 to 3.11.
+Pysam is now compatible with Python 3.11. We have removed Python 2.x
+support. Pysam is tested with Python versions 3.6 to 3.11.
  
  * [#1175] VariantHeader.new_record: set start/stop before alleles
  * [#1173] Add multiple build improvements in htscodecs on multi-arch macOS
@@ -27,9 +106,12 @@ support. Pysam is tested with python versions 3.6 to 3.11.
  * [#1149] MacOS universal build compatibility.
  * [#1146] Fix build when CFLAGS/etc environment variables are set.
  
+
  Release 0.20.0
  ==============
  
+.. rubric:: 29 October 2022
+
  This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1.
  
  * [#1113] Full compatibility with setuptools v62.1.0's build directory name changes
@@ -40,17 +122,23 @@ This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1.
  Many additional type hints have been provided by the community,
  thanks!
  
+
  Release 0.19.1
  ==============
  
+.. rubric:: 27 May 2022
+
  This release wraps htslib/samtools/bcftools version 1.15.1.
  
  * [#1104] add an add_samples() method to quickly add multiple samples
    to VCF.
  
+
  Release 0.19.0
  ==============
  
+.. rubric:: 30 March 2022
+
  This release wraps htslib/samtools/bcftools version 1.15.
  
  * [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands
@@ -66,10 +154,13 @@ This release wraps htslib/samtools/bcftools version 1.15.
  * Fix BGZFile.read() behaviour near or at EOF
  
  * First API for the htslib modified bases interface
-  
+
+
  Release 0.18.0
  ==============
  
+.. rubric:: 17 November 2021
+
  This release wraps htslib/samtools/bcftools version 1.14.
  
  * [#1048] and [#1060], clarify documentation of index statistics with CRAM files
@@ -77,9 +168,12 @@ This release wraps htslib/samtools/bcftools version 1.14.
  * Add new "samples" subcommand to pysam/samtools.py
  * Introduce TupleProxyIterator iterator object class
  
+
  Release 0.17.0
  ==============
  
+.. rubric:: 30 September 2021
+
  This release wraps htslib/samtools/bcftools version 1.13. Corresponding
  to new samtools commands, `pysam.samtools` now has additional functions
  `ampliconclip`, `ampliconstats`, `fqimport`, and `version`.
@@ -122,6 +216,8 @@ Documentation improvements:
  Release 0.16.0
  ==============
  
+.. rubric:: 8 June 2020
+
  This release wraps htslib/bcftools version 1.10.2 and samtools version
  1.10. The following bugs reported against pysam are fixed due to this:
  
@@ -162,6 +258,7 @@ version in order to fix pip install pysam with python 3.8.
  * [#846] Prevent segmentation fault on ID, when handling malformed records
  * [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars
  
+
  Release 0.15.3
  ==============
  
@@ -205,7 +302,7 @@ Bugfix release.
  Release 0.15.0
  ==============
  
-This release wraps htslib (and friends) version 1.9.
+This release wraps htslib/samtools/bcftools version 1.9.
  
  * [#673] permit dash in chromosome name of region string
  * [#656] Support `text` when opening a SAM file for writing
@@ -225,6 +322,7 @@ upgraded to 1.7.0.
  * treat border case of all bases in pileup column below quality score
  * [#634] Fix access to pileup reference_sequence
  
+
  Release 0.14.0
  ==============
  
@@ -289,6 +387,7 @@ contains a series of bugfixes.
  * [#537] allow tabix index files to be created in a custom location.
  * [#530] add get_index_statistics() method
  
+
  Release 0.12.0.1
  ================
  
@@ -304,6 +403,7 @@ contains a series of bugfixes.
  * [#473] A new FastxRecord class that can be instantiated from class and
    modified in-place. Replaces PersistentFastqProxy.
  * [#521] In AligmentFile, Simplify file detection logic and allow remote index files
+
    * Removed attempts to guess data and index file names; this is magic left
      to htslib.
    * Removed file existence check prior to opening files with htslib
@@ -314,6 +414,7 @@ contains a series of bugfixes.
    * Allow remote indices (tested using S3 signed URLs).
    * Document filepath_index and make it an alias for index_filename.
    * Added a require_index parameter to AlignmentFile
+
  * [#526] handle unset ref when creating new records
  * [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
    segfaults
@@ -554,12 +655,14 @@ Potential isses when upgrading from v0.8.3:
  
  * renamed several methods for pep8 compatibility, old names still retained for
    backwards compatibility, but should be considered deprecated.
+
     * gettid() is now get_tid()
     * getrname() is now get_reference_name()
     * parseRegion() is now parse_region()
  
  * some methods have changed for pep8 compatibility without the old
    names being present:
+
     * fromQualityString() is now qualitystring_to_array()
     * toQualityString() is now qualities_to_qualitystring()
  
@@ -678,6 +781,7 @@ Release 0.8.1
  * Pysam now wraps htslib and samtools versions 1.1.
  
  * Bugfixes, most notable:
+
    * issue #43: uncompressed BAM output
    * issue #42: skip tests requiring network if none available
    * issue #19: multiple iterators can now be made to work on the same tabix file
diff --git a/README.rst b/README.rst

index 4f19003806b67306d3b1e37f2e8e04de6e0f2225..b50e2e51bb95e4b351d94298e698b38ce5b7a9a0 100644 (file)
--- a/README.rst
+++ b/README.rst
@@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured
  compilation options. Especially for OS X this will potentially save a
  lot of trouble.
  
-The current version of pysam wraps 3rd-party code from htslib-1.17, samtools-1.17, and bcftools-1.17.
+The current version of pysam wraps 3rd-party code from htslib-1.18, samtools-1.18, and bcftools-1.18.
  
  Pysam is available through `pypi
  <https://pypi.python.org/pypi/pysam>`_. To install, type::
@@ -42,10 +42,10 @@ Questions and comments are very welcome and should be sent to the
  .. _tabix: http://samtools.sourceforge.net/tabix.shtml
  .. _Li 2009: http://www.ncbi.nlm.nih.gov/pubmed/19505943
  
-.. |build-status| image:: https://travis-ci.org/pysam-developers/pysam.svg
+.. |build-status| image:: https://github.com/pysam-developers/pysam/actions/workflows/ci.yaml/badge.svg
      :alt: build status
      :scale: 100%
-    :target: https://travis-ci.org/pysam-developers/pysam
+    :target: https://github.com/pysam-developers/pysam/actions/workflows/ci.yaml
  
  .. |docs| image:: https://readthedocs.org/projects/pysam/badge/?version=latest
      :alt: Documentation Status
diff --git a/bcftools/LICENSE b/bcftools/LICENSE

index 6d40ae2d16c6f7be85efddb76a29d13ab127ae4e..46dc0e0e31d8af12effea1b0c07955dcb3e1aa68 100644 (file)
--- a/bcftools/LICENSE
+++ b/bcftools/LICENSE
@@ -723,11 +723,12 @@ Public License instead of this License.  But first, please read
  
  -----------------------------------------------------------------------------
  
-LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey)
+LICENSE FOR VariantKey (https://github.com/tecnickcom/variantkey)
  
  The MIT License
  
  Copyright (c) 2017-2018 GENOMICS plc
+Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h

index c3f7ded160fed9c90576c1432b453add11e985a2..bba71e3b63ff0bb81801b271a20a6aa7653bd04d 100644 (file)
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -1,6 +1,6 @@
  /*  bcftools.h -- utility function declarations.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
  //  newline will be added by the function.
  void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
  
+// For on the fly index creation with --write-index
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
+
  void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
  const char *hts_bcf_wmode(int file_type);
  const char *hts_bcf_wmode2(int file_type, const char *fname);
diff --git a/bcftools/cigar_state.h b/bcftools/cigar_state.h

index a12a70995bfd734f6305c04db0800869cc0441ac..dacac14acc035674e81b141dc11bcf3eb9a55285 100644 (file)
--- a/bcftools/cigar_state.h
+++ b/bcftools/cigar_state.h
@@ -107,6 +107,12 @@ static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int tri
              cs->icig++;
              continue;
          }
+        if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+        {
+            cs->icig++;
+            continue;
+        }
+        error("FIXME: not ready for CIGAR operator %d\n",op);
      }
      // the read starts after pos
      if ( trim_left )
@@ -175,6 +181,12 @@ static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_
              cs->icig++;
              continue;
          }
+        if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+        {
+            cs->icig++;
+            continue;
+        }
+        error("FIXME: not ready for CIGAR operator %d\n",op);
      }
      return cs->icig < cs->ncig ? -1 : -2;
  }
diff --git a/bcftools/consensus.c b/bcftools/consensus.c

index 397d45f9813be07bb6e7f348368fa0f61df26ab9..2b58670c74c5c06ead220c620d6a43e66b08d378 100644 (file)
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -54,8 +54,8 @@
  #define PICK_SHORT 8
  #define PICK_IUPAC 16
  
-#define TO_UPPER 0
-#define TO_LOWER 1
+#define TO_UPPER 1
+#define TO_LOWER 2
  
  typedef struct
  {
@@ -324,7 +324,7 @@ static void init_region(args_t *args, char *line)
  {
      char *ss, *se = line;
      while ( *se && !isspace(*se) && *se!=':' ) se++;
-    int from = 0, to = 0;
+    hts_pos_t from = 0, to = 0;
      char tmp = 0, *tmp_ptr = NULL;
      if ( *se )
      {
@@ -356,7 +356,14 @@ static void init_region(args_t *args, char *line)
      args->fa_frz_mod = -1;
      args->fa_case    = -1;
      args->vcf_rbuf.n = 0;
-    bcf_sr_seek(args->files,line,args->fa_ori_pos);
+
+    kstring_t str = {0,0,0};
+    if ( from==0 ) from = 1;
+    if ( to==0 ) to = HTS_POS_MAX;
+    ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
+    bcf_sr_set_regions(args->files,line,0);
+    free(str.s);
+
      if ( tmp_ptr ) *tmp_ptr = tmp;
      fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
      if ( args->chain_fname )
@@ -466,25 +473,36 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark)
+// Highlight in place the inserted part of alt, i.e. the characters past strlen(ref):
+// TO_LOWER/TO_UPPER change their case, any other non-zero mark overwrites them.
  static void mark_ins(char *ref, char *alt, char mark)
  {
      int i, nref = strlen(ref), nalt = strlen(alt);
-    if ( mark=='l' )
+    if ( mark==TO_LOWER )
          for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
-    else
+    else if ( mark==TO_UPPER )
          for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
+    else if ( mark )
+        for (i=nref; i<nalt; i++) alt[i] = mark;
  }
+// Highlight in place the characters of alt that differ (case-insensitively) from ref
+// over their common length; a zero mark leaves alt untouched.
  static void mark_snv(char *ref, char *alt, char mark)
  {
      int i, nref = strlen(ref), nalt = strlen(alt);
      int n = nref < nalt ? nref : nalt;
-    if ( mark=='l' )
+    if ( mark==TO_LOWER )
      {
          for (i=0; i<n; i++)
              if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
      }
-    else
+    else if ( mark==TO_UPPER)
      {
          for (i=0; i<n; i++)
              if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
      }
+    else if ( mark )
+    {
+        for (i=0; i<n; i++)
+            if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = mark;
+    }
  }
  static void iupac_init(args_t *args, bcf1_t *rec)
  {
@@ -1099,19 +1118,18 @@ static void usage(args_t *args)
      fprintf(stderr, "    -f, --fasta-ref FILE           Reference sequence in fasta format\n");
      fprintf(stderr, "    -H, --haplotype WHICH          Choose which allele to use from the FORMAT/GT field, note\n");
      fprintf(stderr, "                                   the codes are case-insensitive:\n");
-    fprintf(stderr, "                                       1: first allele from GT, regardless of phasing\n");
-    fprintf(stderr, "                                       2: second allele from GT, regardless of phasing\n");
+    fprintf(stderr, "                                       N: N={1,2,3,..} is the index of the allele from GT, regardless of phasing (e.g. \"2\")\n");
      fprintf(stderr, "                                       R: REF allele in het genotypes\n");
      fprintf(stderr, "                                       A: ALT allele\n");
      fprintf(stderr, "                                       I: IUPAC code for all genotypes\n");
      fprintf(stderr, "                                       LR,LA: longer allele and REF/ALT if equal length\n");
      fprintf(stderr, "                                       SR,SA: shorter allele and REF/ALT if equal length\n");
-    fprintf(stderr, "                                       1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+    fprintf(stderr, "                                       NpIu: index of the allele for phased and IUPAC code for unphased GTs (e.g. \"2pIu\")\n");
      fprintf(stderr, "    -i, --include EXPR             Select sites for which the expression is true (see man page for details)\n");
      fprintf(stderr, "    -I, --iupac-codes              Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
-    fprintf(stderr, "        --mark-del CHAR            Instead of removing sequence, insert CHAR for deletions\n");
-    fprintf(stderr, "        --mark-ins uc|lc           Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
-    fprintf(stderr, "        --mark-snv uc|lc           Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(stderr, "        --mark-del CHAR            Instead of removing sequence, insert character CHAR for deletions\n");
+    fprintf(stderr, "        --mark-ins uc|lc|CHAR      Highlight insertions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
+    fprintf(stderr, "        --mark-snv uc|lc|CHAR      Highlight substitutions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
      fprintf(stderr, "    -m, --mask FILE                Replace regions according to the next --mask-with option. The default is --mask-with N\n");
      fprintf(stderr, "        --mask-with CHAR|uc|lc     Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
      fprintf(stderr, "    -M, --missing CHAR             Output CHAR instead of skipping a missing genotype \"./.\"\n");
@@ -1163,13 +1181,15 @@ int main_consensus(int argc, char *argv[])
          {
              case  1 : args->mark_del = optarg[0]; break;
              case  2 :
-                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
-                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER;
+                else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0];
                  else error("The argument is not recognised: --mark-ins %s\n",optarg);
                  break;
              case  3 :
-                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
-                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER;
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER;
+                else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
                  else error("The argument is not recognised: --mark-snv %s\n",optarg);
                  break;
              case 'p': args->chr_prefix = optarg; break;
@@ -1211,7 +1231,8 @@ int main_consensus(int argc, char *argv[])
                  {
                      char *tmp;
                      args->haplotype = strtol(optarg, &tmp, 10);
-                    if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
+                    if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number or number followed by \"pIu\"\n", optarg);
+                    if ( *tmp ) args->allele |= PICK_IUPAC;
                      if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
                  }
                  break;
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c

index b6119252d320c0ae22638c24e6be4f00b1f24425..9f0826b715cc46720a2794bbdecc9e581ae340ef 100644 (file)
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -56,8 +56,8 @@
  #define PICK_SHORT 8
  #define PICK_IUPAC 16
  
-#define TO_UPPER 0
-#define TO_LOWER 1
+#define TO_UPPER 1
+#define TO_LOWER 2
  
  typedef struct
  {
@@ -326,7 +326,7 @@ static void init_region(args_t *args, char *line)
  {
      char *ss, *se = line;
      while ( *se && !isspace(*se) && *se!=':' ) se++;
-    int from = 0, to = 0;
+    hts_pos_t from = 0, to = 0;
      char tmp = 0, *tmp_ptr = NULL;
      if ( *se )
      {
@@ -358,7 +358,14 @@ static void init_region(args_t *args, char *line)
      args->fa_frz_mod = -1;
      args->fa_case    = -1;
      args->vcf_rbuf.n = 0;
-    bcf_sr_seek(args->files,line,args->fa_ori_pos);
+
+    kstring_t str = {0,0,0};
+    if ( from==0 ) from = 1;
+    if ( to==0 ) to = HTS_POS_MAX;
+    ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
+    bcf_sr_set_regions(args->files,line,0);
+    free(str.s);
+
      if ( tmp_ptr ) *tmp_ptr = tmp;
      fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
      if ( args->chain_fname )
@@ -468,25 +475,36 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark)
+// Highlight in place the inserted part of alt, i.e. the characters past strlen(ref):
+// TO_LOWER/TO_UPPER change their case, any other non-zero mark overwrites them.
  static void mark_ins(char *ref, char *alt, char mark)
  {
      int i, nref = strlen(ref), nalt = strlen(alt);
-    if ( mark=='l' )
+    if ( mark==TO_LOWER )
          for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
-    else
+    else if ( mark==TO_UPPER )
          for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
+    else if ( mark )
+        for (i=nref; i<nalt; i++) alt[i] = mark;
  }
+// Highlight in place the characters of alt that differ (case-insensitively) from ref
+// over their common length; a zero mark leaves alt untouched.
  static void mark_snv(char *ref, char *alt, char mark)
  {
      int i, nref = strlen(ref), nalt = strlen(alt);
      int n = nref < nalt ? nref : nalt;
-    if ( mark=='l' )
+    if ( mark==TO_LOWER )
      {
          for (i=0; i<n; i++)
              if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
      }
-    else
+    else if ( mark==TO_UPPER)
      {
          for (i=0; i<n; i++)
              if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
      }
+    else if ( mark )
+    {
+        for (i=0; i<n; i++)
+            if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = mark;
+    }
  }
  static void iupac_init(args_t *args, bcf1_t *rec)
  {
@@ -1101,19 +1120,18 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -f, --fasta-ref FILE           Reference sequence in fasta format\n");
      fprintf(bcftools_stderr, "    -H, --haplotype WHICH          Choose which allele to use from the FORMAT/GT field, note\n");
      fprintf(bcftools_stderr, "                                   the codes are case-insensitive:\n");
-    fprintf(bcftools_stderr, "                                       1: first allele from GT, regardless of phasing\n");
-    fprintf(bcftools_stderr, "                                       2: second allele from GT, regardless of phasing\n");
+    fprintf(bcftools_stderr, "                                       N: N={1,2,3,..} is the index of the allele from GT, regardless of phasing (e.g. \"2\")\n");
      fprintf(bcftools_stderr, "                                       R: REF allele in het genotypes\n");
      fprintf(bcftools_stderr, "                                       A: ALT allele\n");
      fprintf(bcftools_stderr, "                                       I: IUPAC code for all genotypes\n");
      fprintf(bcftools_stderr, "                                       LR,LA: longer allele and REF/ALT if equal length\n");
      fprintf(bcftools_stderr, "                                       SR,SA: shorter allele and REF/ALT if equal length\n");
-    fprintf(bcftools_stderr, "                                       1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+    fprintf(bcftools_stderr, "                                       NpIu: index of the allele for phased and IUPAC code for unphased GTs (e.g. \"2pIu\")\n");
      fprintf(bcftools_stderr, "    -i, --include EXPR             Select sites for which the expression is true (see man page for details)\n");
      fprintf(bcftools_stderr, "    -I, --iupac-codes              Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
-    fprintf(bcftools_stderr, "        --mark-del CHAR            Instead of removing sequence, insert CHAR for deletions\n");
-    fprintf(bcftools_stderr, "        --mark-ins uc|lc           Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
-    fprintf(bcftools_stderr, "        --mark-snv uc|lc           Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(bcftools_stderr, "        --mark-del CHAR            Instead of removing sequence, insert character CHAR for deletions\n");
+    fprintf(bcftools_stderr, "        --mark-ins uc|lc|CHAR      Highlight insertions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
+    fprintf(bcftools_stderr, "        --mark-snv uc|lc|CHAR      Highlight substitutions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
      fprintf(bcftools_stderr, "    -m, --mask FILE                Replace regions according to the next --mask-with option. The default is --mask-with N\n");
      fprintf(bcftools_stderr, "        --mask-with CHAR|uc|lc     Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
      fprintf(bcftools_stderr, "    -M, --missing CHAR             Output CHAR instead of skipping a missing genotype \"./.\"\n");
@@ -1165,13 +1183,15 @@ int main_consensus(int argc, char *argv[])
          {
              case  1 : args->mark_del = optarg[0]; break;
              case  2 :
-                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
-                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER;
+                else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0];
                  else error("The argument is not recognised: --mark-ins %s\n",optarg);
                  break;
              case  3 :
-                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
-                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER;
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER;
+                else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
                  else error("The argument is not recognised: --mark-snv %s\n",optarg);
                  break;
              case 'p': args->chr_prefix = optarg; break;
@@ -1213,7 +1233,8 @@ int main_consensus(int argc, char *argv[])
                  {
                      char *tmp;
                      args->haplotype = strtol(optarg, &tmp, 10);
-                    if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
+                    if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg);
+                    if ( *tmp ) args->allele |= PICK_IUPAC;
                      if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
                  }
                  break;
diff --git a/bcftools/convert.c b/bcftools/convert.c

index 80e54747df0cd06673200514206771190d895f45..07ff018621962a5ec3f81d9a7d954f3b1ec2cb99 100644 (file)
--- a/bcftools/convert.c
+++ b/bcftools/convert.c
@@ -106,6 +106,7 @@ struct _convert_t
      char **used_tags_list;
      int nused_tags;
      int allow_undef_tags;
+    int force_newline;
      uint8_t **subset_samples;
  };
  
@@ -648,6 +649,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
  static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
      vcf_format1(convert->header, line, str);
+    if ( str->s[str->l-1]=='\n' ) str->l--;
  }
  static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
@@ -1560,7 +1562,6 @@ void convert_destroy(convert_t *convert)
  int convert_header(convert_t *convert, kstring_t *str)
  {
      int i, icol = 0, l_ori = str->l;
-    bcf_hdr_t *hdr = convert->header;
  
      // Supress the header output if LINE is present
      for (i=0; i<convert->nfmt; i++)
@@ -1568,6 +1569,12 @@ int convert_header(convert_t *convert, kstring_t *str)
      if ( i!=convert->nfmt )
          return str->l - l_ori;
  
+    // Header formatting becomes problematic when the formatting expression contains a newline.
+    // Simple cases like
+    //      -f'[%CHROM %POS %SAMPLE\n]'
+    // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines
+    // are present.
+    int has_fmt_newline = 0;
      kputc('#', str);
      for (i=0; i<convert->nfmt; i++)
      {
@@ -1578,18 +1585,25 @@ int convert_header(convert_t *convert, kstring_t *str)
              while ( convert->fmt[j].is_gt_field ) j++;
              for (js=0; js<convert->nsamples; js++)
              {
-                int ks = convert->samples[js];
                  for (k=i; k<j; k++)
                  {
                      if ( convert->fmt[k].type == T_SEP )
                      {
-                        if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+                        if ( convert->fmt[k].key )
+                        {
+                            char *tmp = convert->fmt[k].key;
+                            while ( *tmp )
+                            {
+                                if ( *tmp=='\n' ) has_fmt_newline = 1;
+                                else kputc(*tmp,str);
+                                tmp++;
+                            }
+                        }
                      }
-                    else if ( convert->fmt[k].type == T_SAMPLE )
-                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
                      else
-                        ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
                  }
+                if ( has_fmt_newline ) break;
              }
              i = j-1;
              continue;
@@ -1602,6 +1616,7 @@ int convert_header(convert_t *convert, kstring_t *str)
          }
          ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
      }
+    if ( has_fmt_newline ) kputc('\n',str);
      return str->l - l_ori;
  }
  
@@ -1678,6 +1693,47 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
      return str->l - l_ori;
  }
  
+// Make sure the formatting expression yields a newline. Does nothing when some
+// tag key already contains '\n'; otherwise a "\n" separator is added, see the
+// two placement cases below.
+static void force_newline_(convert_t *convert)
+{
+    int i;
+    for (i=0; i<convert->nfmt; i++)
+    {
+        // tags without a key have nothing to search
+        if ( convert->fmt[i].key && strchr(convert->fmt[i].key,'\n') ) return;
+    }
+
+    // A newline is not present, force it. But where to add it?
+    // Consider
+    //      -f'%CHROM[ %SAMPLE]\n'
+    // vs
+    //      -f'[%CHROM %SAMPLE\n]'
+    for (i=0; i<convert->nfmt; i++)
+        if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
+
+    // An empty format (nfmt==0) must also take the first branch: the second
+    // one reads fmt[nfmt-1], which would be an out-of-bounds fmt[-1] access.
+    if ( i < convert->nfmt || !convert->nfmt )
+        register_tag(convert, "\n", 0, T_SEP);  // the first case
+    else
+    {
+        // the second case: every tag that has a key is flagged is_gt_field
+        i = convert->nfmt - 1;
+        if ( !convert->fmt[i].key )
+        {
+            // The last tag carries no key: reuse it for the newline, flag it
+            // is_gt_field, then register a fresh keyless T_SEP tag.
+            convert->fmt[i].key = strdup("\n");
+            convert->fmt[i].is_gt_field = 1;
+            register_tag(convert, NULL, 0, T_SEP);
+        }
+        else
+            register_tag(convert, "\n", 1, T_SEP);
+    }
+}
+
  int convert_set_option(convert_t *convert, enum convert_option opt, ...)
  {
      int ret = 0;
@@ -1692,6 +1748,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
          case subset_samples:
              convert->subset_samples = va_arg(args, uint8_t**);
              break;
+        case force_newline:
+            convert->force_newline = va_arg(args, int);
+            if ( convert->force_newline ) force_newline_(convert);
+            break;
          default:
              ret = -1;
      }
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c

index 92f9d017582bffd3ea7f0f36f1a385c2c5d3507b..09a7648cc3ff5a8ed809f84be92e9dd9e1d71c90 100644 (file)
--- a/bcftools/convert.c.pysam.c
+++ b/bcftools/convert.c.pysam.c
@@ -108,6 +108,7 @@ struct _convert_t
      char **used_tags_list;
      int nused_tags;
      int allow_undef_tags;
+    int force_newline;
      uint8_t **subset_samples;
  };
  
@@ -650,6 +651,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
  static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
      vcf_format1(convert->header, line, str);
+    if ( str->s[str->l-1]=='\n' ) str->l--;
  }
  static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
@@ -1562,7 +1564,6 @@ void convert_destroy(convert_t *convert)
  int convert_header(convert_t *convert, kstring_t *str)
  {
      int i, icol = 0, l_ori = str->l;
-    bcf_hdr_t *hdr = convert->header;
  
      // Supress the header output if LINE is present
      for (i=0; i<convert->nfmt; i++)
@@ -1570,6 +1571,12 @@ int convert_header(convert_t *convert, kstring_t *str)
      if ( i!=convert->nfmt )
          return str->l - l_ori;
  
+    // Header formatting becomes problematic when the formatting expression contains a newline.
+    // Simple cases like
+    //      -f'[%CHROM %POS %SAMPLE\n]'
+    // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines
+    // are present.
+    int has_fmt_newline = 0;
      kputc('#', str);
      for (i=0; i<convert->nfmt; i++)
      {
@@ -1580,18 +1587,25 @@ int convert_header(convert_t *convert, kstring_t *str)
              while ( convert->fmt[j].is_gt_field ) j++;
              for (js=0; js<convert->nsamples; js++)
              {
-                int ks = convert->samples[js];
                  for (k=i; k<j; k++)
                  {
                      if ( convert->fmt[k].type == T_SEP )
                      {
-                        if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+                        if ( convert->fmt[k].key )
+                        {
+                            char *tmp = convert->fmt[k].key;
+                            while ( *tmp )
+                            {
+                                if ( *tmp=='\n' ) has_fmt_newline = 1;
+                                else kputc(*tmp,str);
+                                tmp++;
+                            }
+                        }
                      }
-                    else if ( convert->fmt[k].type == T_SAMPLE )
-                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
                      else
-                        ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+                        ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
                  }
+                if ( has_fmt_newline ) break;
              }
              i = j-1;
              continue;
@@ -1604,6 +1618,7 @@ int convert_header(convert_t *convert, kstring_t *str)
          }
          ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
      }
+    if ( has_fmt_newline ) kputc('\n',str);
      return str->l - l_ori;
  }
  
@@ -1680,6 +1695,47 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
      return str->l - l_ori;
  }
  
+// Make sure the formatting expression yields a newline. Does nothing when some
+// tag key already contains '\n'; otherwise a "\n" separator is added, see the
+// two placement cases below.
+static void force_newline_(convert_t *convert)
+{
+    int i;
+    for (i=0; i<convert->nfmt; i++)
+    {
+        // tags without a key have nothing to search
+        if ( convert->fmt[i].key && strchr(convert->fmt[i].key,'\n') ) return;
+    }
+
+    // A newline is not present, force it. But where to add it?
+    // Consider
+    //      -f'%CHROM[ %SAMPLE]\n'
+    // vs
+    //      -f'[%CHROM %SAMPLE\n]'
+    for (i=0; i<convert->nfmt; i++)
+        if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
+
+    // An empty format (nfmt==0) must also take the first branch: the second
+    // one reads fmt[nfmt-1], which would be an out-of-bounds fmt[-1] access.
+    if ( i < convert->nfmt || !convert->nfmt )
+        register_tag(convert, "\n", 0, T_SEP);  // the first case
+    else
+    {
+        // the second case: every tag that has a key is flagged is_gt_field
+        i = convert->nfmt - 1;
+        if ( !convert->fmt[i].key )
+        {
+            // The last tag carries no key: reuse it for the newline, flag it
+            // is_gt_field, then register a fresh keyless T_SEP tag.
+            convert->fmt[i].key = strdup("\n");
+            convert->fmt[i].is_gt_field = 1;
+            register_tag(convert, NULL, 0, T_SEP);
+        }
+        else
+            register_tag(convert, "\n", 1, T_SEP);
+    }
+}
+
  int convert_set_option(convert_t *convert, enum convert_option opt, ...)
  {
      int ret = 0;
@@ -1694,6 +1750,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
          case subset_samples:
              convert->subset_samples = va_arg(args, uint8_t**);
              break;
+        case force_newline:
+            convert->force_newline = va_arg(args, int);
+            if ( convert->force_newline ) force_newline_(convert);
+            break;
          default:
              ret = -1;
      }
diff --git a/bcftools/convert.h b/bcftools/convert.h

index 5bbbc2cde9d60246c64efc57d5be1c0122f4a207..0626070937c44a61e82c71812d4e2b854cf3f4ae 100644 (file)
--- a/bcftools/convert.h
+++ b/bcftools/convert.h
@@ -1,6 +1,6 @@
  /*  convert.h -- functions for converting between VCF/BCF and related formats.
  
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -32,6 +32,7 @@ enum convert_option
  {
      allow_undef_tags,
      subset_samples,
+    force_newline,
  };
  
  convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
diff --git a/bcftools/csq.c b/bcftools/csq.c

index 49812d4de7570942bbeb2ebc2a43b284b583fc0c..f619e061add1fb638701bf1796e44296cb6fd9bc 100644 (file)
--- a/bcftools/csq.c
+++ b/bcftools/csq.c
@@ -35,7 +35,7 @@
      Read about transcript types here
          http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
          http://www.ensembl.org/info/genome/variation/predicted_data.html
-        http://www.gencodegenes.org/gencode_biotypes.html
+        https://www.gencodegenes.org/pages/biotypes.html
  
      List of supported biotypes
          antisense
@@ -45,6 +45,7 @@
          IG_LV_gene
          IG_V_gene
          lincRNA
+        lncRNA      .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
          macro_lncRNA
          miRNA
          misc_RNA
@@ -52,7 +53,7 @@
          Mt_tRNA
          polymorphic_pseudogene
          processed_transcript
-        protein_coding
+        protein_coding, mRNA
          ribozyme
          rRNA
          sRNA
@@ -144,6 +145,7 @@
  #include <htslib/khash_str2int.h>
  #include <htslib/kseq.h>
  #include <htslib/faidx.h>
+#include <htslib/bgzf.h>
  #include <errno.h>
  #include <unistd.h>
  #include <ctype.h>
@@ -153,6 +155,7 @@
  #include "kheap.h"
  #include "smpl_ilist.h"
  #include "rbuf.h"
+#include "gff.h"
  
  #ifndef __FUNCTION__
  #  define __FUNCTION__ __func__
@@ -162,20 +165,8 @@
  #define FLT_INCLUDE 1
  #define FLT_EXCLUDE 2
  
-// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR         2
-#define N_SPLICE_REGION_EXON   3
-#define N_SPLICE_REGION_INTRON 8
-
  #define N_REF_PAD 10    // number of bases to avoid boundary effects
  
-#define STRAND_REV 0
-#define STRAND_FWD 1
-
-#define TRIM_NONE   0
-#define TRIM_5PRIME 1
-#define TRIM_3PRIME 2
-
  // How to treat phased/unphased genotypes
  #define PHASE_REQUIRE 0     // --phase r
  #define PHASE_MERGE   1     // --phase m
@@ -223,6 +214,7 @@
  
  #define CSQ_PRN_STRAND(csq)     ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
  #define CSQ_PRN_TSCRIPT         (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_NMD             (~(CSQ_INTRON|CSQ_NON_CODING))
  #define CSQ_PRN_BIOTYPE         CSQ_NON_CODING
  
  // see kput_vcsq()
@@ -254,119 +246,6 @@ const char *csq_strings[] =
      "start_retained"
  };
  
-
-// GFF line types
-#define GFF_UNKN_LINE    0
-#define GFF_TSCRIPT_LINE 1
-#define GFF_GENE_LINE    2
-
-
-/*
-    Genomic features, for fast lookup by position to overlapping features
-*/
-#define GF_coding_bit 6
-#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
-#define GF_MT_rRNA                       1                      // non-coding: 1, 2, ...
-#define GF_MT_tRNA                       2
-#define GF_lincRNA                       3
-#define GF_miRNA                         4
-#define GF_MISC_RNA                      5
-#define GF_rRNA                          6
-#define GF_snRNA                         7
-#define GF_snoRNA                        8
-#define GF_PROCESSED_TRANSCRIPT          9
-#define GF_ANTISENSE                    10
-#define GF_macro_lncRNA                 11
-#define GF_ribozyme                     12
-#define GF_sRNA                         13
-#define GF_scRNA                        14
-#define GF_scaRNA                       15
-#define GF_SENSE_INTRONIC               16
-#define GF_SENSE_OVERLAPPING            17
-#define GF_PSEUDOGENE                   18
-#define GF_PROCESSED_PSEUDOGENE         19
-#define GF_ARTIFACT                     20
-#define GF_IG_PSEUDOGENE                21
-#define GF_IG_C_PSEUDOGENE              22
-#define GF_IG_J_PSEUDOGENE              23
-#define GF_IG_V_PSEUDOGENE              24
-#define GF_TR_V_PSEUDOGENE              25
-#define GF_TR_J_PSEUDOGENE              26
-#define GF_MT_tRNA_PSEUDOGENE           27
-#define GF_misc_RNA_PSEUDOGENE          28
-#define GF_miRNA_PSEUDOGENE             29
-#define GF_RIBOZYME                     30
-#define GF_RETAINED_INTRON              31
-#define GF_RETROTRANSPOSED              32
-#define GF_tRNA_PSEUDOGENE              33
-#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE     34
-#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE   35
-#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE       36
-#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE    37
-#define GF_TRANSLATED_PROCESSED_PSEUDOGENE      38
-#define GF_KNOWN_NCRNA                          39
-#define GF_UNITARY_PSEUDOGENE                   40
-#define GF_UNPROCESSED_PSEUDOGENE               41
-#define GF_LRG_GENE                             42
-#define GF_3PRIME_OVERLAPPING_ncRNA             43
-#define GF_DISRUPTED_DOMAIN                     44
-#define GF_vaultRNA                             45
-#define GF_BIDIRECTIONAL_PROMOTER_lncRNA        46
-#define GF_AMBIGUOUS_ORF                        47
-#define GF_PROTEIN_CODING               (1|(1<<GF_coding_bit))  // coding: 65, 66, ...
-#define GF_POLYMORPHIC_PSEUDOGENE       (2|(1<<GF_coding_bit))
-#define GF_IG_C                         (3|(1<<GF_coding_bit))
-#define GF_IG_D                         (4|(1<<GF_coding_bit))
-#define GF_IG_J                         (5|(1<<GF_coding_bit))
-#define GF_IG_LV                        (6|(1<<GF_coding_bit))
-#define GF_IG_V                         (7|(1<<GF_coding_bit))
-#define GF_TR_C                         (8|(1<<GF_coding_bit))
-#define GF_TR_D                         (9|(1<<GF_coding_bit))
-#define GF_TR_J                        (10|(1<<GF_coding_bit))
-#define GF_TR_V                        (11|(1<<GF_coding_bit))
-#define GF_NMD                         (12|(1<<GF_coding_bit))
-#define GF_NON_STOP_DECAY              (13|(1<<GF_coding_bit))
-#define GF_CDS      ((1<<(GF_coding_bit+1))+1)                  // special types: 129, 130, ...
-#define GF_EXON     ((1<<(GF_coding_bit+1))+2)
-#define GF_UTR3     ((1<<(GF_coding_bit+1))+3)
-#define GF_UTR5     ((1<<(GF_coding_bit+1))+4)
-// GF_MAX = (1<<30)-1, see hap_node_t
-
-#define CDS_PHASE_UNKN 3
-typedef struct _tscript_t tscript_t;
-typedef struct
-{
-    tscript_t *tr;      // transcript
-    uint32_t beg;       // the start coordinate of the CDS (on the reference strand, 0-based)
-    uint32_t pos;       // 0-based index of the first exon base within the transcript (only to
-                        //  update hap_node_t.sbeg in hap_init, could be calculated on the fly)
-    uint32_t len;       // exon length
-    uint32_t icds:30,   // exon index within the transcript
-             phase:2;   // offset of the CDS: 0,1,2 or 3 for unknown
-}
-gf_cds_t;
-typedef struct
-{
-    char *name;           // human readable name, e.g. ORF45
-    uint32_t iseq;
-}
-gf_gene_t;
-typedef struct
-{
-    uint32_t beg,end;
-    tscript_t *tr;
-}
-gf_exon_t;
-typedef enum { prime3, prime5 } utr_t;
-typedef struct
-{
-    utr_t which;
-    uint32_t beg,end;
-    tscript_t *tr;
-}
-gf_utr_t;
-
-
  /*
      Structures related to VCF output:
  
@@ -459,28 +338,21 @@ struct _hap_node_t
      csq_t *csq_list;            // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
      int ncsq_list, mcsq_list;
  };
-struct _tscript_t
+#define TSCRIPT_AUX(x) ((tscript_t*)(x)->aux)
+typedef struct
  {
-    uint32_t id;        // transcript id
-    uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
-    uint32_t strand:1,  // STRAND_REV or STRAND_FWD
-             ncds:31,   // number of exons
-             mcds;
-    gf_cds_t **cds;     // ordered list of exons
      char *ref;          // reference sequence, padded with N_REF_PAD bases on both ends
      char *sref;         // spliced reference sequence, padded with N_REF_PAD bases on both ends
      hap_node_t *root;   // root of the haplotype tree
      hap_node_t **hap;   // pointer to haplotype leaves, two for each sample
      int nhap, nsref;    // number of haplotypes and length of sref, including 2*N_REF_PAD
-    uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
-             type:30;   // one of GF_* types
-    gf_gene_t *gene;
-};
-static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+}
+tscript_t;
+static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b)
  {
      return ( (*a)->end  < (*b)->end ) ? 1 : 0;
  }
-KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript)
  typedef khp_trhp_t tr_heap_t;
  typedef struct
  {
@@ -494,7 +366,7 @@ typedef struct
  {
      int mstack;
      hstack_t *stack;
-    tscript_t *tr;      // tr->ref: spliced transcript on ref strand
+    gf_tscript_t *tr;   // tr->ref: spliced transcript on ref strand
      kstring_t sseq;     // spliced haplotype sequence on ref strand
      kstring_t tseq;     // the variable part of translated haplotype transcript, coding strand
      kstring_t tref;     // the variable part of translated reference transcript, coding strand
@@ -503,77 +375,20 @@ typedef struct
  }
  hap_t;
  
-
-/*
-    Helper structures, only for initialization
-
-    ftr_t
-        temporary list of all exons, CDS, UTRs
-*/
-KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
-typedef struct
-{
-    int type;       // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
-    uint32_t beg;
-    uint32_t end;
-    uint32_t trid;
-    uint32_t strand:1;   // STRAND_REV,STRAND_FWD
-    uint32_t phase:2;    // 0, 1, 2, or 3 for unknown
-    uint32_t iseq:29;
-}
-ftr_t;
-/*
-    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
-    to integer id.  To keep the memory requirements low, the original version
-    relied on IDs in the form of a string prefix and a numerical id.  However,
-    it turns out that this assumption is not valid for some ensembl GFFs, see
-    for example Zea_mays.AGPv4.36.gff3.gz
- */
-typedef struct
-{
-    void *str2id;       // khash_str2int
-    int nstr, mstr;
-    char **str;         // numeric id to string
-}
-id_tbl_t;
-typedef struct
-{
-    // all exons, CDS, UTRs
-    ftr_t *ftr;
-    int nftr, mftr;
-
-    // mapping from gene id to gf_gene_t
-    kh_int2gene_t *gid2gene;
-
-    // mapping from transcript id to tscript, for quick CDS anchoring
-    kh_int2tscript_t *id2tr;
-
-    // sequences
-    void *seq2int;  // str2int hash
-    char **seq;
-    int nseq, mseq;
-
-    // ignored biotypes
-    void *ignored_biotypes;
-
-    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
-}
-aux_t;
-
  typedef struct _args_t
  {
      // the main regidx lookups, from chr:beg-end to overlapping features and
      // index iterator
+    gff_t *gff;
      regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
      regitr_t *itr;
  
-    // temporary structures, deleted after initializtion
-    aux_t init;
-
      // text tab-delimited output (out) or vcf/bcf output (out_fh)
      FILE *out;
      htsFile *out_fh;
+    char *index_fn;
+    int write_index;
+    char *dump_gff;
  
      // vcf
      bcf_srs_t *sr;
@@ -597,6 +412,13 @@ typedef struct _args_t
      int ncsq2_max, nfmt_bcsq;   // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
      int ncsq2_small_warned;
      int brief_predictions;
+    int unify_chr_names;
+    char *chr_name;
+    int mchr_name;
+    struct {
+        int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+        int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+    } warned;
  
      int rid;                    // current chromosome
      tr_heap_t *active_tr;       // heap of active transcripts for quick flushing
@@ -604,11 +426,10 @@ typedef struct _args_t
      vbuf_t **vcf_buf;           // buffered VCF lines to annotate with CSQ and flush
      rbuf_t vcf_rbuf;            // round buffer indexes to vcf_buf
      kh_pos2vbuf_t *pos2vbuf;    // fast lookup of buffered lines by position
-    tscript_t **rm_tr;          // buffer of transcripts to clean
+    gf_tscript_t **rm_tr;       // buffer of transcripts to clean
      int nrm_tr, mrm_tr;
      csq_t *csq_buf;             // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
      int ncsq_buf, mcsq_buf;
-    id_tbl_t tscript_ids;       // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
      int force;                  // force run under various conditions. Currently only to skip out-of-phase transcripts
      int n_threads;              // extra compression/decompression threads
  
@@ -645,818 +466,6 @@ const uint8_t cnt4[] =
  #define dna2aa(x)  gencode[  nt4[(uint8_t)(x)[0]]<<4 |  nt4[(uint8_t)(x)[1]]<<2 |  nt4[(uint8_t)(x)[2]] ]
  #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
  
-static const char *gf_strings_noncoding[] =
-{
-    "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
-    "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
-    "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
-    "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
-    "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
-    "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
-    "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
-    "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
-};
-static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
-static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
-
-const char *gf_type2gff_string(int type)
-{
-    if ( !GF_is_coding(type) )
-    {
-        if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
-        type &= (1<<(GF_coding_bit+1)) - 1;
-        return gf_strings_special[type - 1];
-    }
-    type &= (1<<GF_coding_bit) - 1;
-    return gf_strings_coding[type - 1];
-}
-
-/*
-    gff parsing functions
-*/
-static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
-{
-    aux_t *aux = &args->init;
-    char c = chr_end[1];
-    chr_end[1] = 0;
-    int iseq;
-    if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
-    {
-        // check for possible mismatch in chromosome naming convention such as chrX vs X
-        char *new_chr = NULL;
-        if ( faidx_has_seq(args->fai,chr_beg) )
-            new_chr = strdup(chr_beg);                  // valid chr name, the same in gff and faidx
-        else
-        {
-            int len = strlen(chr_beg);
-            if ( !strncmp("chr",chr_beg,3) && len>3 )
-                new_chr = strdup(chr_beg+3);            // gff has the prefix, faidx does not
-            else
-            {
-                new_chr = malloc(len+4);                // gff does not have the prefix, faidx has
-                memcpy(new_chr,"chr",3);
-                memcpy(new_chr+3,chr_beg,len);
-                new_chr[len+3] = 0;
-            }
-            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
-            {
-                static int unkwn_chr_warned = 0;
-                if ( !unkwn_chr_warned && args->verbosity>0 )
-                    fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
-                unkwn_chr_warned = 1;
-                free(new_chr);
-                new_chr = strdup(chr_beg);              // use the original sequence name
-            }
-        }
-        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
-        {
-            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
-            aux->seq[aux->nseq] = new_chr;
-            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
-            aux->nseq++;
-            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
-        }
-        else
-            free(new_chr);
-    }
-    chr_end[1] = c;
-    return iseq;
-}
-static inline char *gff_skip(const char *line, char *ss)
-{
-    while ( *ss && *ss!='\t' ) ss++;
-    if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    return ss+1;
-}
-static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
-{
-    char *se = (char*) line;
-    while ( *se && *se!='\t' ) se++;
-    if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    *chr_beg = (char*) line;
-    *chr_end = se-1;
-}
-static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
-{
-    char *se = ss;
-    *beg = strtol(ss, &se, 10) - 1;
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
-    ss = se+1;
-    *end = strtol(ss, &se, 10) - 1;
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    return se+1;
-}
-static void gff_id_init(id_tbl_t *tbl)
-{
-    memset(tbl, 0, sizeof(*tbl));
-    tbl->str2id = khash_str2int_init();
-}
-static void gff_id_destroy(id_tbl_t *tbl)
-{
-    khash_str2int_destroy_free(tbl->str2id);
-    free(tbl->str);
-}
-// returns 0 on success, -1 on failure
-static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
-{
-    ss = strstr(ss,needle);     // e.g. "ID=transcript:"
-    if ( !ss ) return -1;
-    ss += strlen(needle);
-
-    char *se = ss;
-    while ( *se && *se!=';' && !isspace(*se) ) se++;
-    char tmp = *se;
-    *se = 0;
-
-    int id;
-    if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
-    {
-        id = tbl->nstr++;
-        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
-        tbl->str[id] = strdup(ss);
-        khash_str2int_set(tbl->str2id, tbl->str[id], id);
-    }
-    *se = tmp;
-    *id_ptr = id;
-    return 0;
-}
-static inline int gff_parse_type(char *line)
-{
-    line = strstr(line,"ID=");
-    if ( !line ) return -1;
-    line += 3;
-    if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
-    else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
-    return -1;
-}
-// Translate the value of the "biotype=" attribute into a GF_* constant.
-//   _line .. text searched for the first "biotype=" substring
-// Returns -1 when there is no "biotype=", 0 when the value is not one of the
-// names listed below, the matching GF_* constant otherwise.
-// Names are matched by prefix (strncmp), so within one case a longer name has
-// to be tested before a name that is its prefix, e.g. Mt_tRNA_pseudogene
-// before Mt_tRNA.
-static inline int gff_parse_biotype(char *_line)
-{
-    char *line = strstr(_line,"biotype=");
-    if ( !line ) return -1;
-
-    line += 8;  // step over "biotype="
-    switch (*line)  // dispatch on the first letter to keep the comparison chains short
-    {
-        case 'p':
-            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
-            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
-            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
-            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
-            break;
-        case 'a':
-            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
-            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
-            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
-            break;
-        case 'I':
-            if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
-            else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
-            else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
-            else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
-            else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
-            else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
-            break;
-        case 'T':
-            if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
-            else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
-            else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
-            else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
-            else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
-            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
-            break;
-        case 'M':
-            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
-            else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_rRNA;    // was GF_MT_tRNA, a copy-paste slip
-            break;
-        case 'l':
-            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
-            break;
-        case 'm':
-            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
-            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
-            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
-            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
-            break;
-        case 'r':
-            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
-            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
-            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
-            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
-            break;
-        case 's':
-            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
-            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
-            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
-            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
-            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
-            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
-            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
-            break;
-        case 't':
-            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
-            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
-            break;
-        case 'n':
-            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
-            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
-            break;
-        case 'k':
-            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
-            break;
-        case 'u':
-            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
-            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
-            break;
-        case 'L':
-            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
-            break;
-        case '3':
-            if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
-            break;
-        case 'd':
-            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
-            break;
-        case 'v':
-            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
-            break;
-        case 'b':
-            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
-            break;
-    }
-    return 0;   // "biotype=" present but the value is not recognised
-}
-// Count one more occurrence of the biotype named in the "biotype=" attribute
-// of ss in the args->init.ignored_biotypes hash.
-// Returns 1 if a "biotype=" attribute was found, 0 otherwise.
-static inline int gff_ignored_biotype(args_t *args, char *ss)
-{
-    char *name = strstr(ss,"biotype=");
-    if ( !name ) return 0;
-    name += 8;
-
-    // terminate the value at ';' (or end of string) while it is used as a key
-    char *stop = name;
-    while ( *stop && *stop!=';' ) stop++;
-    char saved = *stop;
-    *stop = 0;
-
-    int count = 0;
-    int known = khash_str2int_get(args->init.ignored_biotypes, name, &count)==0;
-    // a biotype seen for the first time is stored under a private copy of its name
-    khash_str2int_set(args->init.ignored_biotypes, known ? name : strdup(name), count+1);
-
-    *stop = saved;
-    return 1;
-}
-// Return the gene record stored under gene_id in aux->gid2gene. When there is
-// none yet, a zero-initialised record is allocated, inserted and returned.
-gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
-{
-    khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
-    if ( k != kh_end(aux->gid2gene) && kh_val(aux->gid2gene, k) ) return kh_val(aux->gid2gene, k);
-
-    // not known yet (or stored as NULL): create and register a blank record
-    gf_gene_t *gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
-    int absent;
-    k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &absent);
-    kh_val(aux->gid2gene,k) = gene;
-    return gene;
-}
-// Handle one transcript line of the GFF.
-//   line .. the whole line, used in messages only
-//   ss   .. position in the line from which the attributes are searched
-//   ftr  .. supplies the already parsed strand, beg and end
-// A transcript whose biotype is missing or not recognised is skipped. Otherwise
-// a new tscript_t is allocated, linked to its parent gene and stored in
-// aux->id2tr under the numeric transcript id.
-void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
-{
-    // each of the two notation warnings is printed at most once
-    static int warned_id = 0, warned_parent = 0;
-
-    int biotype = gff_parse_biotype(ss);
-    if ( biotype <= 0 )
-    {
-        if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line);
-        return;
-    }
-    aux_t *aux = &args->init;
-
-    // transcript id: the standard "ID=transcript:" first, plain "ID=" as a fallback
-    uint32_t trid;
-    if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) != 0 )
-    {
-        if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) != 0 )
-            error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        if ( !warned_id && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
-            warned_id = 1;
-        }
-    }
-
-    // parent gene id: the standard "Parent=gene:" first, plain "Parent=" as a fallback
-    uint32_t gene_id;
-    if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) != 0 )
-    {
-        if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) != 0 )
-            error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        if ( !warned_parent && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
-            warned_parent = 1;
-        }
-    }
-
-    // create the transcript record and make it reachable by its numeric id
-    tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
-    tr->id     = trid;
-    tr->type   = biotype;
-    tr->strand = ftr->strand;
-    tr->beg    = ftr->beg;
-    tr->end    = ftr->end;
-    tr->gene   = gene_init(aux, gene_id);
-
-    int absent;
-    khint_t k = kh_put(int2tscript, aux->id2tr, (int)trid, &absent);
-    kh_val(aux->id2tr,k) = tr;
-}
-// Handle one gene line of the GFF.
-//   line            .. the whole line, used in messages only
-//   ss              .. position in the line from which the attributes are searched
-//   chr_beg,chr_end .. first and last character of the chromosome name
-//   ftr             .. not used by this function
-// A gene whose biotype is missing or not recognised is skipped. Otherwise the
-// gene record is looked up (or created) and its sequence index and name are
-// filled in; the name falls back to the gene ID when "Name=" is absent.
-void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
-{
-    int biotype = gff_parse_biotype(ss);
-    if ( biotype <= 0 )
-    {
-        if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line);
-        return;
-    }
-
-    aux_t *aux = &args->init;
-
-    // substring search for "ID=gene:ENSG00000437963"
-    uint32_t gene_id;
-    if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
-    {
-        if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
-            error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;  // print the notation warning once only
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-
-    gf_gene_t *gene = gene_init(aux, gene_id);
-    assert( !gene->name );      // the gene_id should be unique
-
-    gene->iseq = feature_set_seq(args, chr_beg,chr_end);
-
-    // substring search for "Name=OR4F5"
-    ss = strstr(chr_end+2,"Name=");
-    if ( ss )
-    {
-        ss += 5;
-        char *se = ss;
-        // isspace() is only defined for unsigned char values and EOF; the cast keeps
-        // bytes >= 0x80 well-defined on platforms where plain char is signed
-        while ( *se && *se!=';' && !isspace((unsigned char)*se) ) se++;
-        gene->name = (char*) malloc(se-ss+1);
-        memcpy(gene->name,ss,se-ss);
-        gene->name[se-ss] = 0;
-    }
-    else
-        gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
-}
-int gff_parse(args_t *args, char *line, ftr_t *ftr) /*
-    Parse one line of the GFF.
-    Gene and transcript lines are handed to gff_parse_gene / gff_parse_transcript
-    and -1 is returned. exon, CDS and UTR lines fill ftr (type, beg, end, strand,
-    phase, trid, iseq) and 0 is returned. Blank lines, comments, ignored feature
-    types and lines with an unknown strand or phase return -1.
-    */
-{
-    // - skip empty lines and commented lines
-    // - columns
-    //      1.      chr
-    //      2.      <skip>
-    //      3.      CDS, transcript, gene, ...
-    //      4-5.    beg,end
-    //      6.      <skip>
-    //      7.      strand
-    //      8.      phase
-    //      9.      Parent=transcript:ENST(\d+);ID=... etc
-
-    char *ss = line;
-    if ( !*ss ) return -1;      // skip blank lines
-    if ( *ss=='#' ) return -1;  // skip comments
-
-    char *chr_beg, *chr_end;
-    gff_parse_chr(line, &chr_beg, &chr_end);    // chr_beg is set to line; chr_end presumably to the last char of column 1 -- helper starts above this chunk
-    ss = gff_skip(line, chr_end + 2);           // presumably advances over column 2 -- gff_skip is not visible here, TODO confirm
-
-    // 3. column: is this a CDS, transcript, gene, etc.
-    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
-    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
-    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
-    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
-    else
-    {
-        int type = GFF_UNKN_LINE;
-        if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE;    // NOTE(review): only 4 chars are compared, so any type starting with "gene" matches -- confirm this is intended
-        else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE;  // NOTE(review): only "tran" is compared -- confirm this is intended
-        ss = gff_skip(line, ss);    // past the type column (assuming gff_skip skips one column)
-        ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
-        ss = gff_skip(line, ss);    // past the column that follows beg,end
-        if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss);   // determine type from ID=transcript: or ID=gene:
-        if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
-        {
-            // we ignore these, debug print to see new types:
-            ss = strstr(ss,"ID=");
-            if ( !ss ) return -1;   // no ID, ignore the line
-            if ( !strncmp("chromosome",ss+3,10) ) return -1;
-            if ( !strncmp("supercontig",ss+3,11) ) return -1;
-            if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line);
-            return -1;
-        }
-
-        // 7. column: strand
-        if ( *ss == '+' ) ftr->strand = STRAND_FWD;
-        else if ( *ss == '-' ) ftr->strand = STRAND_REV;
-        else error("Unknown strand: %c .. %s\n", *ss,ss);
-
-        if ( type==GFF_TSCRIPT_LINE )
-            gff_parse_transcript(args, line, ss, ftr);
-        else
-            gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
-
-        return -1;  // gene and transcript lines never yield a feature for the caller
-    }
-    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
-    ss = gff_skip(line, ss);    // past the column that follows beg,end
-
-    // 7. column: strand
-    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
-    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
-    else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
-    ss += 2;    // the strand character and its separator
-
-    // 8. column: phase (codon offset)
-    if ( *ss == '0' ) ftr->phase = 0;
-    else if ( *ss == '1' ) ftr->phase = 1;
-    else if ( *ss == '2' ) ftr->phase = 2;
-    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
-    else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
-    ss += 2;    // the phase character and its separator
-
-    // substring search for "Parent=transcript:ENST00000437963"
-    if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
-    {
-        if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
-            error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;  // print the notation warning once only
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-
-    ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
-    return 0;
-}
-
-// qsort() comparator for an array of gf_cds_t pointers (a transcript's CDS
-// list): ascending by beg.
-static int cmp_cds_ptr(const void *a, const void *b)
-{
-    const gf_cds_t *x = *(gf_cds_t**)a;
-    const gf_cds_t *y = *(gf_cds_t**)b;
-    if ( x->beg != y->beg ) return x->beg < y->beg ? -1 : 1;
-    return 0;
-}
-
-// Point *chr_beg at the first and *chr_end at the last character of the
-// sequence name aux->seq[iseq].
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
-{
-    char *first = aux->seq[iseq];
-    char *last  = first;
-    while ( last[1] ) last++;   // stop on the character just before the terminating 0
-    *chr_beg = first;
-    *chr_end = last;
-}
-// Return the transcript stored under trid in aux->id2tr. The transcript must
-// exist; this is enforced with assert().
-tscript_t *tscript_init(aux_t *aux, uint32_t trid)
-{
-    tscript_t *tr = NULL;
-    khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
-    if ( k != kh_end(aux->id2tr) ) tr = kh_val(aux->id2tr, k);
-    assert( tr );
-    return tr;
-}
-// Append the CDS described by ftr (a parsed GFF CDS line) to the CDS list of
-// its transcript. The CDS is not inserted into idx_cds here.
-// Calls error() when the strand of the CDS differs from that of the transcript.
-void register_cds(args_t *args, ftr_t *ftr)
-{
-    tscript_t *tr = tscript_init(&args->init, ftr->trid);
-    if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
-    gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
-    cds->tr    = tr;
-    cds->icds  = 0;     // to keep valgrind on mac happy
-    cds->phase = ftr->phase;
-    cds->beg   = ftr->beg;
-    cds->len   = ftr->end - ftr->beg + 1;
-
-    // make room for one more pointer, then store it at the end of the list
-    hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
-    tr->cds[tr->ncds] = cds;
-    tr->ncds++;
-}
-// Create a UTR record from ftr (a parsed five_prime_UTR / three_prime_UTR
-// line) and push it into args->idx_utr on the chromosome of the transcript's gene.
-void register_utr(args_t *args, ftr_t *ftr)
-{
-    gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
-    if ( ftr->type==GF_UTR3 )
-        utr->which = prime3;
-    else
-        utr->which = prime5;
-    utr->beg = ftr->beg;
-    utr->end = ftr->end;
-    utr->tr  = tscript_init(&args->init, ftr->trid);
-
-    // the index is keyed by chromosome name, taken from the transcript's gene
-    char *name_beg, *name_end;
-    chr_beg_end(&args->init, utr->tr->gene->iseq, &name_beg, &name_end);
-    regidx_push(args->idx_utr, name_beg,name_end, utr->beg,utr->end, &utr);
-}
-// Create an exon record from ftr (a parsed exon line) and push it into
-// args->idx_exon. The indexed interval is widened by N_SPLICE_REGION_INTRON
-// on both sides; the record itself keeps the original coordinates.
-void register_exon(args_t *args, ftr_t *ftr)
-{
-    gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
-    exon->beg = ftr->beg;
-    exon->end = ftr->end;
-    exon->tr  = tscript_init(&args->init, ftr->trid);
-
-    // the index is keyed by chromosome name, taken from the transcript's gene
-    char *name_beg, *name_end;
-    chr_beg_end(&args->init, exon->tr->gene->iseq, &name_beg, &name_end);
-    regidx_push(args->idx_exon, name_beg,name_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
-}
-
-void tscript_init_cds(args_t *args) /*
-    Finalise all transcripts held in args->init.id2tr.
-    Every transcript is pushed into idx_tscript. For transcripts with CDS the
-    CDS list is sorted by beg, the phase of the first coding CDS is applied,
-    the phase column is checked against the accumulated length, overlaps
-    between consecutive CDS are reported, the total length is trimmed to a
-    multiple of three and finally each CDS gets its offset (pos) and is
-    pushed into idx_cds. With args->force set, phase inconsistencies skip the
-    transcript and overlaps only warn; without it both call error().
-    */
-{
-    aux_t *aux = &args->init;
-
-    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
-    khint_t k;
-    int warn_phase_unkn = 0;    // set when any CDS has no phase; reported once at the end
-    for (k=0; k<kh_end(aux->id2tr); k++)
-    {
-        if ( !kh_exist(aux->id2tr, k) ) continue;
-        tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
-
-        // position-to-tscript lookup
-        char *chr_beg, *chr_end;
-        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
-        regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
-
-        if ( !tr->ncds ) continue;      // transcript with no CDS
-
-        // sort CDs
-        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
-
-        // trim non-coding start
-        int i, len = 0;
-        if ( tr->strand==STRAND_FWD )
-        {
-            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
-            {
-                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
-                tr->cds[0]->beg += tr->cds[0]->phase;   // drop the leading `phase` bases of the first CDS
-                tr->cds[0]->len -= tr->cds[0]->phase;
-                tr->cds[0]->phase = 0;
-            }
-
-            // sanity check phase; the phase number in gff tells us how many bases to skip in this
-            // feature to reach the first base of the next codon
-            int tscript_ok = 1;
-            for (i=0; i<tr->ncds; i++)
-            {
-                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
-                {
-                    warn_phase_unkn = 1;
-                    len += tr->cds[i]->len;
-                    continue;
-                }
-                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;  // converted so it can be compared with len%3
-                if ( phase!=len%3 )
-                {
-                    if ( args->force )
-                    {
-                        if ( args->verbosity > 0 )
-                            fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
-                                args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                        tscript_ok = 0;
-                        break;
-                    }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
-                            args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                }
-                len += tr->cds[i]->len;
-            }
-            if ( !tscript_ok ) continue;    // skip this transcript
-        }
-        else
-        {
-            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
-            {
-                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
-                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
-                // todo: the same for the fwd strand
-                i = tr->ncds - 1;
-                int phase = tr->cds[i]->phase;
-                if ( phase ) tr->trim |= TRIM_5PRIME;
-                while ( i>=0 && phase > tr->cds[i]->len )   // whole CDS shorter than the remaining phase: empty it and move on
-                {
-                    phase -= tr->cds[i]->len;
-                    tr->cds[i]->phase = 0;
-                    tr->cds[i]->len   = 0;
-                    i--;
-                }
-                tr->cds[i]->len  -= tr->cds[i]->phase;  // NOTE(review): uses the stored phase of cds[i], not the local remainder `phase`; i may be -1 if the loop above consumed every CDS -- confirm
-                tr->cds[i]->phase = 0;
-            }
-
-            // sanity check phase
-            int tscript_ok = 1;
-            for (i=tr->ncds-1; i>=0; i--)   // the list is sorted by beg, so on this strand it is walked from the last CDS down
-            {
-                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
-                {
-                    warn_phase_unkn = 1;
-                    len += tr->cds[i]->len;
-                    continue;
-                }
-                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
-                if ( phase!=len%3)
-                {
-                    if ( args->force )
-                    {
-                        if ( args->verbosity > 0 )
-                            fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
-                                args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                        tscript_ok = 0;
-                        break;
-                    }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
-                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                }
-                len += tr->cds[i]->len;
-            }
-            if ( !tscript_ok ) continue;    // skip this transcript
-        }
-
-        // set len. At the same check that CDS within a transcript do not overlap
-        len = 0;
-        for (i=0; i<tr->ncds; i++)
-        {
-            tr->cds[i]->icds = i;
-            len += tr->cds[i]->len;
-            if ( !i ) continue;
-
-            gf_cds_t *a = tr->cds[i-1];
-            gf_cds_t *b = tr->cds[i];
-            if ( a->beg + a->len - 1 >= b->beg )    // last base of the previous CDS reaches into this one
-            {
-                if ( args->force )
-                {
-                    fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
-                        args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
-                }
-                else
-                    error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
-                          "       Use the --force option to override (at your own risk).\n",
-                            args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
-            }
-        }
-        if ( len%3 != 0 )
-        {
-            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
-            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
-            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
-
-            tr->trim |= TRIM_3PRIME;
-            if ( tr->strand==STRAND_FWD )
-            {
-                i = tr->ncds - 1;
-                while ( i>=0 && len%3 )     // shorten from the last CDS backwards until len is a multiple of 3
-                {
-                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
-                    tr->cds[i]->len -= dlen;
-                    len -= dlen;
-                    i--;
-                }
-            }
-            else
-            {
-                i = 0;
-                while ( i<tr->ncds && len%3 )   // same, from the first CDS forwards; beg moves as well
-                {
-                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
-                    tr->cds[i]->len -= dlen;
-                    tr->cds[i]->beg += dlen;
-                    len -= dlen;
-                    i++;
-                }
-            }
-        }
-
-        // set CDS offsets and insert into regidx
-        len=0;
-        for (i=0; i<tr->ncds; i++)
-        {
-            tr->cds[i]->pos = len;  // summed length of the CDS that precede this one in the sorted list
-            len += tr->cds[i]->len;
-            regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
-        }
-    }
-    if ( warn_phase_unkn && args->verbosity > 0 )
-        fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
-}
-
-// Payload destructor passed to regidx_init for idx_cds, idx_utr and idx_exon:
-// the payload stores a pointer, the record it points to is freed.
-void regidx_free_gf(void *payload)
-{
-    gf_cds_t **rec = (gf_cds_t**) payload;
-    free(*rec);
-}
-// Payload destructor passed to regidx_init for idx_tscript: frees the
-// transcript's array of CDS pointers and then the transcript itself.
-void regidx_free_tscript(void *payload)
-{
-    tscript_t **rec = (tscript_t**) payload;
-    tscript_t *tr = *rec;
-    free(tr->cds);
-    free(tr);
-}
-
-// Read args->gff_fname and build the region indexes used for lookups:
-// args->idx_tscript, args->idx_cds, args->idx_utr and args->idx_exon, plus
-// the iterator args->itr.
-// The file is parsed in one pass (gene and transcript records are created by
-// gff_parse, exon/CDS/UTR features are collected in aux->ftr), the collected
-// features are then attached to their transcripts and tscript_init_cds()
-// finalises the CDS. Parsing helpers that are no longer needed are freed at
-// the end; aux->gid2gene is kept so the genes can be destroyed later.
-// Calls error() when the file cannot be opened or closed.
-void init_gff(args_t *args)
-{
-    aux_t *aux = &args->init;
-    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
-    aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
-    aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
-    args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
-    aux->ignored_biotypes = khash_str2int_init();
-    gff_id_init(&aux->gene_ids);
-    gff_id_init(&args->tscript_ids);
-
-    // parse gff
-    kstring_t str = {0,0,0};
-    htsFile *fp = hts_open(args->gff_fname,"r");
-    if ( !fp ) error("Failed to read %s\n", args->gff_fname);
-    // hts_getline returns the line length, which is 0 for a blank line; testing
-    // for ">0" would silently stop reading at the first blank line of the file
-    while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
-    {
-        if ( !str.l ) continue;     // blank line, nothing to parse
-        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
-        int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
-        if ( !ret ) aux->nftr++;    // keep the slot only when a feature was filled in
-    }
-    free(str.s);
-    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
-
-
-    // process gff information: connect CDS and exons to transcripts
-    args->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
-    args->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
-    args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
-    args->itr      = regitr_init(NULL);
-
-    int i;
-    for (i=0; i<aux->nftr; i++)
-    {
-        ftr_t *ftr = &aux->ftr[i];
-
-        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
-        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
-        if ( k==kh_end(aux->id2tr) ) continue;       // no such transcript
-
-        tscript_t *tr = kh_val(aux->id2tr,k);
-        if ( !tr->gene->name )
-        {
-            // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
-            regidx_free_tscript(&tr);
-            kh_del(int2tscript, aux->id2tr,k);
-            continue;
-        }
-
-        // populate regidx by category:
-        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
-        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
-        if ( ftr->type==GF_CDS ) register_cds(args, ftr);
-        else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
-        else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
-        else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
-        else
-            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
-    }
-    tscript_init_cds(args);
-
-    if ( args->verbosity > 0 )
-    {
-        fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
-                regidx_nregs(args->idx_tscript),
-                regidx_nregs(args->idx_exon),
-                regidx_nregs(args->idx_cds),
-                regidx_nregs(args->idx_utr));
-    }
-    if ( !regidx_nregs(args->idx_tscript) )
-        fprintf(stderr,
-            "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
-            "         or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
-            "         of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
-
-    // the parsing helpers are not needed past this point
-    free(aux->ftr);
-    khash_str2int_destroy_free(aux->seq2int);
-    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
-    kh_destroy(int2tscript,aux->id2tr);
-    free(aux->seq);
-    gff_id_destroy(&aux->gene_ids);
-
-    if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
-    {
-        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
-        fprintf(stderr,"Ignored the following biotypes:\n");
-        for (i = kh_begin(ign); i < kh_end(ign); i++)
-        {
-            if ( !kh_exist(ign,i)) continue;
-            const char *biotype = kh_key(ign,i);
-            if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
-            fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
-        }
-    }
-    khash_str2int_destroy_free(aux->ignored_biotypes);
-}
-
  static inline int ncsq2_to_nfmt(int ncsq2)
  {
      return 1 + (ncsq2 - 1) / 30;
@@ -1474,8 +483,17 @@ void init_data(args_t *args)
      args->fai = fai_load(args->fa_fname);
      if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
  
-    if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
-    init_gff(args);
+    args->gff = gff_init(args->gff_fname);
+    gff_set(args->gff,verbosity,args->verbosity);
+    gff_set(args->gff,strip_chr_names,args->unify_chr_names);
+    gff_set(args->gff,force_out_of_phase,args->force);
+    gff_set(args->gff,dump_fname,args->dump_gff);
+    gff_parse(args->gff);
+    args->idx_cds  = gff_get(args->gff,idx_cds);
+    args->idx_utr  = gff_get(args->gff,idx_utr);
+    args->idx_exon = gff_get(args->gff,idx_exon);
+    args->idx_tscript = gff_get(args->gff,idx_tscript);
+    args->itr = regitr_init(NULL);
  
      args->rid = -1;
  
@@ -1536,6 +554,7 @@ void init_data(args_t *args)
          if ( args->hdr_nsmpl )
              bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
          if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      }
      if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
  }
@@ -1547,21 +566,8 @@ void destroy_data(args_t *args)
              "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
              "      the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
  
-    regidx_destroy(args->idx_cds);
-    regidx_destroy(args->idx_utr);
-    regidx_destroy(args->idx_exon);
-    regidx_destroy(args->idx_tscript);
      regitr_destroy(args->itr);
-
-    khint_t k,i,j;
-    for (k=0; k<kh_end(args->init.gid2gene); k++)
-    {
-        if ( !kh_exist(args->init.gid2gene, k) ) continue;
-        gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
-        free(gene->name);
-        free(gene);
-    }
-    kh_destroy(int2gene,args->init.gid2gene);
+    gff_destroy(args->gff);
  
      if ( args->filter )
          filter_destroy(args->filter);
@@ -1569,9 +575,20 @@ void destroy_data(args_t *args)
      khp_destroy(trhp,args->active_tr);
      kh_destroy(pos2vbuf,args->pos2vbuf);
      if ( args->smpl ) smpl_ilist_destroy(args->smpl);
-    int ret;
+    int i,j,ret;
      if ( args->out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
          ret = hts_close(args->out_fh);
+    }
      else
          ret = fclose(args->out);
      if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
@@ -1602,7 +619,7 @@ void destroy_data(args_t *args)
      free(args->gt_arr);
      free(args->str.s);
      free(args->str2.s);
-    gff_id_destroy(&args->tscript_ids);
+    free(args->chr_name);
  }
  
  /*
@@ -1614,7 +631,7 @@ void destroy_data(args_t *args)
  #define SPLICE_OVERLAP 3   // indel overlaps region boundary, csq set but could not determine csq
  typedef struct
  {
-    tscript_t *tr;
+    gf_tscript_t *tr;
      struct {
          int32_t pos, rlen, alen, ial;
          char *ref, *alt;
@@ -1678,7 +695,7 @@ fprintf(stderr,"build_hap:  rbeg=%d + %d    abeg=%d \n",rbeg,rlen,abeg);
      if ( rbeg < splice->vcf.pos )
      {
          assert( splice->tr->beg <= rbeg );  // this can be extended thanks to N_REF_PAD
-        kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
          roff = 0;
      }
      else
@@ -1703,7 +720,7 @@ fprintf(stderr,"r2: %s\n",splice->kref.s);
          if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
              rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
          if ( splice->kref.l < rlen )
-            kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+            kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
      }
  #if XDBG
  fprintf(stderr,"r3: %s\n",splice->kref.s);
@@ -1714,7 +731,7 @@ fprintf(stderr,"r3: %s\n",splice->kref.s);
      if ( abeg < splice->vcf.pos )
      {
          assert( splice->tr->beg <= abeg );
-        kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
          aoff = 0;
      }
      else
@@ -1742,7 +759,7 @@ fprintf(stderr,"a2: %s  aoff=%d\n",splice->kalt.s,aoff);
          if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
              alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
          if ( alen > 0 && alen > splice->kalt.l )
-            kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+            kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
      }
  #if XDBG
  fprintf(stderr,"a3: %s\n",splice->kalt.s);
@@ -1755,7 +772,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
      while ( regitr_overlap(itr) )
      {
          gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
-        tscript_t *tr = utr->tr;
+        gf_tscript_t *tr = utr->tr;
          if ( tr->id != trid ) continue;
          csq_t csq;
          memset(&csq, 0, sizeof(csq_t));
@@ -1771,7 +788,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
      }
      return 0;
  }
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
  {
  #if XDBG
  fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
@@ -1788,6 +805,21 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
      csq.type.gene    = tr->gene->name;
      csq_stage(args, &csq, rec);
  }
+static inline const char *drop_chr_prefix(args_t *args, const char *chr)  // Return chr without a leading "chr" when args->unify_chr_names is set; otherwise chr itself
+{
+    if ( !args->unify_chr_names ) return chr;       // unification disabled: hand the name back untouched
+    if ( !strncasecmp("chr",chr,3) ) return chr+3;  // case-insensitive prefix match; the result points into the caller's string, nothing is copied
+    return chr;                                     // no "chr" prefix present
+}
+static inline const char *add_chr_prefix(args_t *args, const char *chr)  // Return "chr" + chr, built in args->chr_name, when args->unify_chr_names is set; otherwise chr itself
+{
+    if ( !args->unify_chr_names ) return chr;               // unification disabled: hand the name back untouched
+    int len = strlen(chr);
+    hts_expand(char,len+4,args->mchr_name,args->chr_name);  // len+4 = 3 bytes of "chr" + the name + terminating NUL
+    memcpy(args->chr_name,"chr",3);                         // the prefix is written unconditionally, with no check whether chr already starts with "chr"
+    memcpy(args->chr_name+3,chr,len+1);                     // len+1 also copies the NUL terminator
+    return args->chr_name;                                  // shared buffer: the next call overwrites it, so the caller must not free it or keep it across calls
+}
  static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
  {
      // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
@@ -1813,7 +845,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
          if ( splice->check_utr )
          {
              regitr_t *itr = regitr_init(NULL);
-            const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
              if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) )     // adjacent utr
              {
                  ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1851,7 +883,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
          if ( splice->check_utr )
          {
              regitr_t *itr = regitr_init(NULL);
-            const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
              if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) )     // adjacent utr
              {
                  ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1924,7 +956,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
  int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
  {
      static int small_ref_padding_warned = 0;
-    tscript_t *tr = splice->tr;
+    gf_tscript_t *tr = splice->tr;
  
      // We know the VCF record overlaps the exon, but does it overlap the start codon?
      if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
@@ -1956,7 +988,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
          }
  
          char *ptr_vcf = splice->vcf.ref + alt_len;                         // the first deleted base in the VCF REF allele
-        char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
+        char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
  #if XDBG
          fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
  #endif
@@ -1985,7 +1017,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
          }
  
          char *ptr_vcf = splice->vcf.ref + alt_len;                                      // the first deleted base in the VCF REF allele
-        char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
+        char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
  #if XDBG
          fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
  #endif
@@ -2030,7 +1062,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2086,7 +1118,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2175,7 +1207,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2205,7 +1237,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_ut
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2291,7 +1323,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
  {
      int i;
      kstring_t str = {0,0,0};
-    tscript_t *tr = cds->tr;
+    gf_tscript_t *tr = cds->tr;
      child->icds = cds->icds;     // index of cds in the tscript's list of exons
      child->vcf_ial = ial;
  
@@ -2313,8 +1345,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
      }
      if ( splice.check_start )   // do not check starts in incomplete CDS, defined as not starting with M
      {
-        if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
-        else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+        if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+        else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
      }
      if ( child->icds!=0 ) splice.check_region_beg = 1;
      if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -2373,12 +1405,12 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
              // the variant is on a new exon, finish up the previous
              int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
              if ( len > 0 )
-                kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+                kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
          }
  
          // append any skipped non-variant exons
          while ( ++i < cds->icds )
-            kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
  
          if ( parent->icds==child->icds )
          {
@@ -2390,10 +1422,10 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
                  free(splice.kalt.s);
                  return 1;
              }
-            kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
          }
          else
-            kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
      }
      kputs(splice.kalt.s + dbeg, &str);
  
@@ -2645,28 +1677,28 @@ fprintf(stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,
  #endif
  }
  
-void tscript_splice_ref(tscript_t *tr)
+void tscript_splice_ref(gf_tscript_t *tr)  // Fill TSCRIPT_AUX(tr)->sref with: N_REF_PAD pad bytes, every CDS of tr in index order, N_REF_PAD pad bytes, NUL
  {
      int i, len = 0;
      for (i=0; i<tr->ncds; i++)
          len += tr->cds[i]->len;
  
-    tr->nsref = len + 2*N_REF_PAD;
-    tr->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
+    TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD;  // spliced length including both pads, excluding the NUL
+    TSCRIPT_AUX(tr)->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);  // +1 for the terminating NUL written at the end
      len = 0;
  
-    memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+    memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);  // leading pad: the N_REF_PAD bytes of ref immediately before the first CDS
      len += N_REF_PAD;
  
      for (i=0; i<tr->ncds; i++)
      {
-        memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+        memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);  // append CDS i; ref is offset by N_REF_PAD relative to tr->beg
          len += tr->cds[i]->len;
      }
-    memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+    memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);  // NOTE(review): trailing pad is read from the START of the last CDS (same offset as its data above), not from past its end -- confirm this is intended
      len += N_REF_PAD;
  
-    tr->sref[len] = 0;
+    TSCRIPT_AUX(tr)->sref[len] = 0;  // NUL-terminate; len now equals nsref
  }
  
  // returns: 0 if consequence was added, 1 if it already exists or could not be added
@@ -2800,18 +1832,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
      if ( csq->type & CSQ_UPSTREAM_STOP )
          kputc_('*',str);
  
-    int i, n = sizeof(csq_strings)/sizeof(char*);
+    int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*);
      for (i=1; i<n; i++)
-        if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+        if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputs(csq_strings[i],str); break; }
      i++;
      for (; i<n; i++)
-        if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+        if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputc_('&',str); kputs(csq_strings[i],str); }
+
+    if ( (csq->biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) )
+    {
+        if ( has_csq ) kputc_('&',str); // just in case, this should always be true
+        kputs("NMD_transcript",str);
+    }
  
      kputc_('|', str);
      if ( csq->gene ) kputs(csq->gene , str);
  
      kputc_('|', str);
-    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+//    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);
  
      kputc_('|', str);
      kputs(gf_type2gff_string(csq->biotype), str);
@@ -2840,7 +1879,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
  void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
  {
      int i;
-    tscript_t *tr = hap->tr;
+    gf_tscript_t *tr = hap->tr;
      int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
      int icsq = node->ncsq_list++;
      hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2954,7 +1993,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
      str.l = 0;
  
      // create the aa variant string
-    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
      int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
      kputc_('|', &str);
      kputw(aa_rbeg, &str);
@@ -3020,13 +2059,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
  
  void hap_finalize(args_t *args, hap_t *hap)
  {
-    tscript_t *tr = hap->tr;
-    if ( !tr->sref )
+    gf_tscript_t *tr = hap->tr;
+    if ( !TSCRIPT_AUX(tr)->sref )
          tscript_splice_ref(tr);
  
      kstring_t sref;
-    sref.s = tr->sref;
-    sref.l = tr->nsref;
+    sref.s = TSCRIPT_AUX(tr)->sref;
+    sref.l = TSCRIPT_AUX(tr)->nsref;
      sref.m = sref.l;
  
      int istack = 0;
@@ -3034,7 +2073,7 @@ void hap_finalize(args_t *args, hap_t *hap)
  
      hap->sseq.l = 0;
      hap->tseq.l = 0;
-    hap->stack[0].node = tr->root;
+    hap->stack[0].node = TSCRIPT_AUX(tr)->root;
      hap->stack[0].ichild = -1;
      hap->stack[0].slen = 0;
      hap->stack[0].dlen = 0;
@@ -3214,7 +2253,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
      kput_vcsq(args, &csq->type, &args->str);
      fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
  }
-static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
  {
      if ( !node || !node->ncsq_list ) return;
  
@@ -3240,7 +2279,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
      }
  }
  
-static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
  {
      if ( !node || !node->ncsq_list || ismpl<0 ) return;
  
@@ -3276,23 +2315,23 @@ void hap_flush(args_t *args, uint32_t pos)
      tr_heap_t *heap = args->active_tr;
      while ( heap->ndat && heap->dat[0]->end<=pos )
      {
-        tscript_t *tr = heap->dat[0];
+        gf_tscript_t *tr = heap->dat[0];
          khp_delete(trhp, heap);
          args->hap->tr = tr;
-        if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+        if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling
          {
              hap_finalize(args, args->hap);
  
              if ( args->output_type==FT_TAB_TEXT )   // plain text output, not a vcf
              {
                  if ( args->phase==PHASE_DROP_GT )
-                    hap_print_text(args, tr, -1,0, tr->hap[0]);
+                    hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
                  else
                  {
                      for (i=0; i<args->smpl->n; i++)
                      {
                          for (j=0; j<2; j++)
-                            hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+                            hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]);
                      }
                  }
              }
@@ -3301,7 +2340,7 @@ void hap_flush(args_t *args, uint32_t pos)
                  for (i=0; i<args->smpl->n; i++)
                  {
                      for (j=0; j<2; j++)
-                        hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+                        hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]);
                  }
              }
          }
@@ -3309,7 +2348,7 @@ void hap_flush(args_t *args, uint32_t pos)
          // mark the transcript for deletion. Cannot delete it immediately because
          // by-position VCF output will need them when flushed by vcf_buf_push
          args->nrm_tr++;
-        hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+        hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
          args->rm_tr[args->nrm_tr-1] = tr;
      }
  }
@@ -3424,24 +2463,33 @@ void vbuf_flush(args_t *args, uint32_t pos)
  
      for (i=0; i<args->nrm_tr; i++)
      {
-        tscript_t *tr = args->rm_tr[i];
-        if ( tr->root ) hap_destroy(tr->root);
-        tr->root = NULL;
-        free(tr->hap);
-        free(tr->ref);
-        free(tr->sref);
+        gf_tscript_t *tr = args->rm_tr[i];
+        tscript_t *aux = TSCRIPT_AUX(tr);
+        if ( aux->root ) hap_destroy(aux->root);
+        aux->root = NULL;
+        free(aux->hap);
+        free(aux->ref);
+        free(aux->sref);
+        free(aux);
+        tr->aux = NULL;
      }
      args->nrm_tr = 0;
      args->ncsq_buf = 0;
  }
  
-void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
  {
      int i, len;
      int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
  
-    tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
-    if ( !tr->ref )
+    const char *tmp_chr = chr;
+    if ( !faidx_has_seq(args->fai,tmp_chr) )
+    {
+        tmp_chr = drop_chr_prefix(args,chr);
+        if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
+    }
+    TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+    if ( !TSCRIPT_AUX(tr)->ref )
          error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
  
      int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
@@ -3449,23 +2497,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
      {
          char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
          for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
-        memcpy(ref+i, tr->ref, len);
+        memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len);
          len += i;
          for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
          ref[i+len] = 0;
-        free(tr->ref);
-        tr->ref = ref;
+        free(TSCRIPT_AUX(tr)->ref);
+        TSCRIPT_AUX(tr)->ref = ref;
      }
  }
  
-static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
  {
      int vbeg = 0;
      int rbeg = rec->pos - tr->beg + N_REF_PAD;
      if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
-    char *ref = tr->ref + rbeg;
+    char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
      char *vcf = rec->d.allele[0] + vbeg;
-    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
+    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );
      int i = 0;
      while ( ref[i] && vcf[i] )
      {
@@ -3479,7 +2527,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
  int test_cds_local(args_t *args, bcf1_t *rec)
  {
      int i,j, ret = 0;
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      // note that the off-by-one extension of rlen is deliberate to account for insertions
      if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
  
@@ -3491,12 +2539,13 @@ int test_cds_local(args_t *args, bcf1_t *rec)
      while ( regitr_overlap(args->itr) )
      {
          gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
-        tscript_t *tr = cds->tr;
+        gf_tscript_t *tr = cds->tr;
          if ( !GF_is_coding(tr->type) ) continue;
          ret = 1;
  
-        if ( !tr->ref )
+        if ( !TSCRIPT_AUX(tr) )
          {
+            tr->aux = calloc(sizeof(tscript_t),1);
              tscript_init_ref(args, tr, chr);
              tscript_splice_ref(tr);
              khp_insert(trhp, args->active_tr, &tr);     // only to clean the reference afterwards
@@ -3505,8 +2554,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
          sanity_check_ref(args, tr, rec);
  
          kstring_t sref;
-        sref.s = tr->sref;
-        sref.l = tr->nsref;
+        sref.s = TSCRIPT_AUX(tr)->sref;
+        sref.l = TSCRIPT_AUX(tr)->nsref;
          sref.m = sref.l;
  
          for (i=1; i<rec->n_allele; i++)
@@ -3614,8 +2663,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
                  {
                      // create the aa variant string
                      kstring_t str = {0,0,0};
-                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
-                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
                      kputc_('|', &str);
                      kputw(aa_rbeg, &str);
                      kprint_aa_prediction(args,aa_rbeg,tref,&str);
@@ -3633,11 +2682,11 @@ int test_cds_local(args_t *args, bcf1_t *rec)
                      csq_stage(args, &csq, rec);
  
                      // all this only to clean vstr when vrec is flushed
-                    if ( !tr->root )
-                        tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
-                    tr->root->ncsq_list++;
-                    hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
-                    csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+                    if ( !TSCRIPT_AUX(tr)->root )
+                        TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+                    TSCRIPT_AUX(tr)->root->ncsq_list++;
+                    hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
+                    csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
                      rm_csq->type.vstr = str;
                  }
                  if ( csq_type & ~CSQ_COMPOUND )
@@ -3659,27 +2708,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
      static int overlaps_warned = 0, multiploid_warned = 0;
  
      int i, ret = 0, hap_ret;
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      // note that the off-by-one extension of rlen is deliberate to account for insertions
      if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
      while ( regitr_overlap(args->itr) )
      {
          gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
-        tscript_t *tr = cds->tr;
+        gf_tscript_t *tr = cds->tr;
          if ( !GF_is_coding(tr->type) ) continue;
          if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
          ret = 1;
-        if ( !tr->root )
+        if ( !TSCRIPT_AUX(tr) )
          {
              // initialize the transcript and its haplotype tree, fetch the reference sequence
+            tr->aux = calloc(sizeof(tscript_t),1);
              tscript_init_ref(args, tr, chr);
  
-            tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
-            tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
-            tr->hap  = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
-            for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
-            tr->root->nend = tr->nhap;
-            tr->root->type = HAP_ROOT;
+            TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+            TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
+            TSCRIPT_AUX(tr)->hap  = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
+            for (i=0; i<TSCRIPT_AUX(tr)->nhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
+            TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;
+            TSCRIPT_AUX(tr)->root->type = HAP_ROOT;
  
              khp_insert(trhp, args->active_tr, &tr);
          }
@@ -3689,7 +2739,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
          if ( args->phase==PHASE_DROP_GT )
          {
              if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
-            hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+            hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
              hap_node_t *child  = (hap_node_t*)calloc(1,sizeof(hap_node_t));
              hap_ret = hap_init(args, parent, child, cds, rec, 1);
              if ( hap_ret!=0 )
@@ -3734,8 +2784,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
              parent->mchild = 1;
              parent->child  = (hap_node_t**) malloc(sizeof(hap_node_t*));
              parent->child[0] = child;
-            tr->hap[0] = child;
-            tr->hap[0]->nend = 1;
+            TSCRIPT_AUX(tr)->hap[0] = child;
+            TSCRIPT_AUX(tr)->hap[0]->nend = 1;
              continue;
          }
  
@@ -3793,12 +2843,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
                  assert( ial < rec->n_allele );
                  if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
  
-                hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+                hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
                  if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
                  {
                      // this haplotype has been seen in another sample
-                    tr->hap[i] = parent->child[ parent->cur_child[ial] ];
-                    tr->hap[i]->nend++;
+                    TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
+                    TSCRIPT_AUX(tr)->hap[i]->nend++;
                      parent->nend--;
                      continue;
                  }
@@ -3852,8 +2902,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
                  hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
                  parent->cur_child[ial] = j;
                  parent->child[j] = child;
-                tr->hap[i] = child;
-                tr->hap[i]->nend++;
+                TSCRIPT_AUX(tr)->hap[i] = child;
+                TSCRIPT_AUX(tr)->hap[i]->nend++;
                  parent->nend--;
              }
          }
@@ -3933,7 +2983,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
  }
  int test_utr(args_t *args, bcf1_t *rec)
  {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      // note that the off-by-one extension of rlen is deliberate to account for insertions
      if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
  
@@ -3944,7 +2994,7 @@ int test_utr(args_t *args, bcf1_t *rec)
      while ( regitr_overlap(args->itr) )
      {
          gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
-        tscript_t *tr = splice.tr = utr->tr;
+        gf_tscript_t *tr = splice.tr = utr->tr;
          for (i=1; i<rec->n_allele; i++)
          {
              if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -3971,7 +3021,7 @@ int test_utr(args_t *args, bcf1_t *rec)
  }
  int test_splice(args_t *args, bcf1_t *rec)
  {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
  
      splice_t splice;
@@ -4003,7 +3053,7 @@ int test_splice(args_t *args, bcf1_t *rec)
  }
  int test_tscript(args_t *args, bcf1_t *rec)
  {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
  
      splice_t splice;
@@ -4012,7 +3062,7 @@ int test_tscript(args_t *args, bcf1_t *rec)
      int i, ret = 0;
      while ( regitr_overlap(args->itr) )
      {
-        tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+        gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
          for (i=1; i<rec->n_allele; i++)
          {
              if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -4046,7 +3096,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
          warned = 1;
      }
  
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
  
      // only insertions atm
      int beg = rec->pos + 1;
@@ -4061,7 +3111,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
              csq_t csq;
              memset(&csq, 0, sizeof(csq_t));
              gf_cds_t *cds    = regitr_payload(args->itr,gf_cds_t*);
-            tscript_t *tr    = cds->tr;
+            gf_tscript_t *tr = cds->tr;
              csq.type.type    = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
              csq.pos          = rec->pos;
              csq.type.biotype = tr->type;
@@ -4079,7 +3129,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
              csq_t csq;
              memset(&csq, 0, sizeof(csq_t));
              gf_utr_t *utr    = regitr_payload(args->itr, gf_utr_t*);
-            tscript_t *tr    = utr->tr;
+            gf_tscript_t *tr = utr->tr;
              csq.type.type    = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
              csq.pos          = rec->pos;
              csq.type.biotype = tr->type;
@@ -4118,7 +3168,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
          {
              csq_t csq;
              memset(&csq, 0, sizeof(csq_t));
-            tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+            gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
              splice.vcf.alt = rec->d.allele[1];
              splice.csq     = csq_class;
              int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
@@ -4179,7 +3229,10 @@ static void process(args_t *args, bcf1_t **rec_ptr)
          // Perform a simple sanity check (that does not catch much), the chromosome must be present in the
          // reference file
          if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
-            error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+        {
+            if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
+                error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+        }
      }
      if ( prev_pos > rec->pos )
          error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
@@ -4254,9 +3307,12 @@ static const char *usage(void)
          "                                       r: require phased GTs, throw an error on unphased het GTs\n"
          "                                       R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
          "                                       s: skip unphased hets\n"
-        "Options:\n"
-        "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
+        "GFF options:\n"
+        "       --dump-gff FILE.gz            Dump the parsed GFF file (for debugging purposes)\n"
          "       --force                       Run even if some sanity checks fail\n"
+        "       --unify-chr-names 1|0         Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+        "General options:\n"
+        "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
          "   -i, --include EXPR                Select sites for which the expression is true\n"
          "       --no-version                  Do not append version and command line to the header\n"
          "   -o, --output FILE                 Write output to a file [standard output]\n"
@@ -4272,6 +3328,7 @@ static const char *usage(void)
          "       --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
          "       --threads INT                 Use multithreading with <int> worker threads [0]\n"
          "   -v, --verbose INT                 Verbosity level 0-2 [1]\n"
+        "       --write-index                 Automatically index the output files [off]\n"
          "\n"
          "Example:\n"
          "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4292,6 +3349,7 @@ int main_csq(int argc, char *argv[])
      args->verbosity = 1;
      args->record_cmd_line = 1;
      args->clevel = -1;
+    args->unify_chr_names = 1;
  
      static struct option loptions[] =
      {
@@ -4321,6 +3379,9 @@ int main_csq(int argc, char *argv[])
          {"targets-file",1,0,'T'},
          {"targets-overlap",required_argument,NULL,5},
          {"no-version",no_argument,NULL,3},
+        {"write-index",no_argument,NULL,6},
+        {"dump-gff",required_argument,NULL,7},
+        {"unify-chr-names",required_argument,NULL,8},
          {0,0,0,0}
      };
      int c, targets_is_file = 0, regions_is_file = 0;
@@ -4339,7 +3400,7 @@ int main_csq(int argc, char *argv[])
              case  3 : args->record_cmd_line = 0; break;
              case 'b':
                      args->brief_predictions = 1;
-                    fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+                    fprintf(stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n");
                      break;
              case 'B':
                      args->brief_predictions = strtol(optarg,&tmp,10);
@@ -4409,6 +3470,13 @@ int main_csq(int argc, char *argv[])
                  targets_overlap = parse_overlap_option(optarg);
                  if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                  break;
+            case  6 : args->write_index = 1; break;
+            case  7 : args->dump_gff = optarg; break;
+            case  8 :
+                if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
+                else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
+                else error("Could not parse: --unify-chr-names %s\n",optarg);
+                break;
              case 'h':
              case '?': error("%s",usage());
              default: error("The option not recognised: %s\n\n", optarg); break;
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c

index 8feb7af2228a2057c4c47387a44f1a26bcaeec60..5f590d16fd5d4c56ad60f72e410094ba1195057f 100644 (file)
--- a/bcftools/csq.c.pysam.c
+++ b/bcftools/csq.c.pysam.c
@@ -37,7 +37,7 @@
      Read about transcript types here
          http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
          http://www.ensembl.org/info/genome/variation/predicted_data.html
-        http://www.gencodegenes.org/gencode_biotypes.html
+        https://www.gencodegenes.org/pages/biotypes.html
  
      List of supported biotypes
          antisense
@@ -47,6 +47,7 @@
          IG_LV_gene
          IG_V_gene
          lincRNA
+        lncRNA      .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
          macro_lncRNA
          miRNA
          misc_RNA
@@ -54,7 +55,7 @@
          Mt_tRNA
          polymorphic_pseudogene
          processed_transcript
-        protein_coding
+        protein_coding, mRNA
          ribozyme
          rRNA
          sRNA
@@ -146,6 +147,7 @@
  #include <htslib/khash_str2int.h>
  #include <htslib/kseq.h>
  #include <htslib/faidx.h>
+#include <htslib/bgzf.h>
  #include <errno.h>
  #include <unistd.h>
  #include <ctype.h>
@@ -155,6 +157,7 @@
  #include "kheap.h"
  #include "smpl_ilist.h"
  #include "rbuf.h"
+#include "gff.h"
  
  #ifndef __FUNCTION__
  #  define __FUNCTION__ __func__
@@ -164,20 +167,8 @@
  #define FLT_INCLUDE 1
  #define FLT_EXCLUDE 2
  
-// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR         2
-#define N_SPLICE_REGION_EXON   3
-#define N_SPLICE_REGION_INTRON 8
-
  #define N_REF_PAD 10    // number of bases to avoid boundary effects
  
-#define STRAND_REV 0
-#define STRAND_FWD 1
-
-#define TRIM_NONE   0
-#define TRIM_5PRIME 1
-#define TRIM_3PRIME 2
-
  // How to treat phased/unphased genotypes
  #define PHASE_REQUIRE 0     // --phase r
  #define PHASE_MERGE   1     // --phase m
@@ -225,6 +216,7 @@
  
  #define CSQ_PRN_STRAND(csq)     ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
  #define CSQ_PRN_TSCRIPT         (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_NMD             (~(CSQ_INTRON|CSQ_NON_CODING))
  #define CSQ_PRN_BIOTYPE         CSQ_NON_CODING
  
  // see kput_vcsq()
@@ -256,119 +248,6 @@ const char *csq_strings[] =
      "start_retained"
  };
  
-
-// GFF line types
-#define GFF_UNKN_LINE    0
-#define GFF_TSCRIPT_LINE 1
-#define GFF_GENE_LINE    2
-
-
-/*
-    Genomic features, for fast lookup by position to overlapping features
-*/
-#define GF_coding_bit 6
-#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
-#define GF_MT_rRNA                       1                      // non-coding: 1, 2, ...
-#define GF_MT_tRNA                       2
-#define GF_lincRNA                       3
-#define GF_miRNA                         4
-#define GF_MISC_RNA                      5
-#define GF_rRNA                          6
-#define GF_snRNA                         7
-#define GF_snoRNA                        8
-#define GF_PROCESSED_TRANSCRIPT          9
-#define GF_ANTISENSE                    10
-#define GF_macro_lncRNA                 11
-#define GF_ribozyme                     12
-#define GF_sRNA                         13
-#define GF_scRNA                        14
-#define GF_scaRNA                       15
-#define GF_SENSE_INTRONIC               16
-#define GF_SENSE_OVERLAPPING            17
-#define GF_PSEUDOGENE                   18
-#define GF_PROCESSED_PSEUDOGENE         19
-#define GF_ARTIFACT                     20
-#define GF_IG_PSEUDOGENE                21
-#define GF_IG_C_PSEUDOGENE              22
-#define GF_IG_J_PSEUDOGENE              23
-#define GF_IG_V_PSEUDOGENE              24
-#define GF_TR_V_PSEUDOGENE              25
-#define GF_TR_J_PSEUDOGENE              26
-#define GF_MT_tRNA_PSEUDOGENE           27
-#define GF_misc_RNA_PSEUDOGENE          28
-#define GF_miRNA_PSEUDOGENE             29
-#define GF_RIBOZYME                     30
-#define GF_RETAINED_INTRON              31
-#define GF_RETROTRANSPOSED              32
-#define GF_tRNA_PSEUDOGENE              33
-#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE     34
-#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE   35
-#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE       36
-#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE    37
-#define GF_TRANSLATED_PROCESSED_PSEUDOGENE      38
-#define GF_KNOWN_NCRNA                          39
-#define GF_UNITARY_PSEUDOGENE                   40
-#define GF_UNPROCESSED_PSEUDOGENE               41
-#define GF_LRG_GENE                             42
-#define GF_3PRIME_OVERLAPPING_ncRNA             43
-#define GF_DISRUPTED_DOMAIN                     44
-#define GF_vaultRNA                             45
-#define GF_BIDIRECTIONAL_PROMOTER_lncRNA        46
-#define GF_AMBIGUOUS_ORF                        47
-#define GF_PROTEIN_CODING               (1|(1<<GF_coding_bit))  // coding: 65, 66, ...
-#define GF_POLYMORPHIC_PSEUDOGENE       (2|(1<<GF_coding_bit))
-#define GF_IG_C                         (3|(1<<GF_coding_bit))
-#define GF_IG_D                         (4|(1<<GF_coding_bit))
-#define GF_IG_J                         (5|(1<<GF_coding_bit))
-#define GF_IG_LV                        (6|(1<<GF_coding_bit))
-#define GF_IG_V                         (7|(1<<GF_coding_bit))
-#define GF_TR_C                         (8|(1<<GF_coding_bit))
-#define GF_TR_D                         (9|(1<<GF_coding_bit))
-#define GF_TR_J                        (10|(1<<GF_coding_bit))
-#define GF_TR_V                        (11|(1<<GF_coding_bit))
-#define GF_NMD                         (12|(1<<GF_coding_bit))
-#define GF_NON_STOP_DECAY              (13|(1<<GF_coding_bit))
-#define GF_CDS      ((1<<(GF_coding_bit+1))+1)                  // special types: 129, 130, ...
-#define GF_EXON     ((1<<(GF_coding_bit+1))+2)
-#define GF_UTR3     ((1<<(GF_coding_bit+1))+3)
-#define GF_UTR5     ((1<<(GF_coding_bit+1))+4)
-// GF_MAX = (1<<30)-1, see hap_node_t
-
-#define CDS_PHASE_UNKN 3
-typedef struct _tscript_t tscript_t;
-typedef struct
-{
-    tscript_t *tr;      // transcript
-    uint32_t beg;       // the start coordinate of the CDS (on the reference strand, 0-based)
-    uint32_t pos;       // 0-based index of the first exon base within the transcript (only to
-                        //  update hap_node_t.sbeg in hap_init, could be calculated on the fly)
-    uint32_t len;       // exon length
-    uint32_t icds:30,   // exon index within the transcript
-             phase:2;   // offset of the CDS: 0,1,2 or 3 for unknown
-}
-gf_cds_t;
-typedef struct
-{
-    char *name;           // human readable name, e.g. ORF45
-    uint32_t iseq;
-}
-gf_gene_t;
-typedef struct
-{
-    uint32_t beg,end;
-    tscript_t *tr;
-}
-gf_exon_t;
-typedef enum { prime3, prime5 } utr_t;
-typedef struct
-{
-    utr_t which;
-    uint32_t beg,end;
-    tscript_t *tr;
-}
-gf_utr_t;
-
-
  /*
      Structures related to VCF output:
  
@@ -461,28 +340,21 @@ struct _hap_node_t
      csq_t *csq_list;            // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
      int ncsq_list, mcsq_list;
  };
-struct _tscript_t
+#define TSCRIPT_AUX(x) ((tscript_t*)(x)->aux)
+typedef struct
  {
-    uint32_t id;        // transcript id
-    uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
-    uint32_t strand:1,  // STRAND_REV or STRAND_FWD
-             ncds:31,   // number of exons
-             mcds;
-    gf_cds_t **cds;     // ordered list of exons
      char *ref;          // reference sequence, padded with N_REF_PAD bases on both ends
      char *sref;         // spliced reference sequence, padded with N_REF_PAD bases on both ends
      hap_node_t *root;   // root of the haplotype tree
      hap_node_t **hap;   // pointer to haplotype leaves, two for each sample
      int nhap, nsref;    // number of haplotypes and length of sref, including 2*N_REF_PAD
-    uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
-             type:30;   // one of GF_* types
-    gf_gene_t *gene;
-};
-static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+}
+tscript_t;
+static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b)
  {
      return ( (*a)->end  < (*b)->end ) ? 1 : 0;
  }
-KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript)
  typedef khp_trhp_t tr_heap_t;
  typedef struct
  {
@@ -496,7 +368,7 @@ typedef struct
  {
      int mstack;
      hstack_t *stack;
-    tscript_t *tr;      // tr->ref: spliced transcript on ref strand
+    gf_tscript_t *tr;   // tr->ref: spliced transcript on ref strand
      kstring_t sseq;     // spliced haplotype sequence on ref strand
      kstring_t tseq;     // the variable part of translated haplotype transcript, coding strand
      kstring_t tref;     // the variable part of translated reference transcript, coding strand
@@ -505,77 +377,20 @@ typedef struct
  }
  hap_t;
  
-
-/*
-    Helper structures, only for initialization
-
-    ftr_t
-        temporary list of all exons, CDS, UTRs
-*/
-KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
-typedef struct
-{
-    int type;       // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
-    uint32_t beg;
-    uint32_t end;
-    uint32_t trid;
-    uint32_t strand:1;   // STRAND_REV,STRAND_FWD
-    uint32_t phase:2;    // 0, 1, 2, or 3 for unknown
-    uint32_t iseq:29;
-}
-ftr_t;
-/*
-    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
-    to integer id.  To keep the memory requirements low, the original version
-    relied on IDs in the form of a string prefix and a numerical id.  However,
-    it turns out that this assumption is not valid for some ensembl GFFs, see
-    for example Zea_mays.AGPv4.36.gff3.gz
- */
-typedef struct
-{
-    void *str2id;       // khash_str2int
-    int nstr, mstr;
-    char **str;         // numeric id to string
-}
-id_tbl_t;
-typedef struct
-{
-    // all exons, CDS, UTRs
-    ftr_t *ftr;
-    int nftr, mftr;
-
-    // mapping from gene id to gf_gene_t
-    kh_int2gene_t *gid2gene;
-
-    // mapping from transcript id to tscript, for quick CDS anchoring
-    kh_int2tscript_t *id2tr;
-
-    // sequences
-    void *seq2int;  // str2int hash
-    char **seq;
-    int nseq, mseq;
-
-    // ignored biotypes
-    void *ignored_biotypes;
-
-    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
-}
-aux_t;
-
  typedef struct _args_t
  {
      // the main regidx lookups, from chr:beg-end to overlapping features and
      // index iterator
+    gff_t *gff;
      regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
      regitr_t *itr;
  
-    // temporary structures, deleted after initializtion
-    aux_t init;
-
      // text tab-delimited output (out) or vcf/bcf output (out_fh)
      FILE *out;
      htsFile *out_fh;
+    char *index_fn;
+    int write_index;
+    char *dump_gff;
  
      // vcf
      bcf_srs_t *sr;
@@ -599,6 +414,13 @@ typedef struct _args_t
      int ncsq2_max, nfmt_bcsq;   // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
      int ncsq2_small_warned;
      int brief_predictions;
+    int unify_chr_names;
+    char *chr_name;
+    int mchr_name;
+    struct {
+        int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+        int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+    } warned;
  
      int rid;                    // current chromosome
      tr_heap_t *active_tr;       // heap of active transcripts for quick flushing
@@ -606,11 +428,10 @@ typedef struct _args_t
      vbuf_t **vcf_buf;           // buffered VCF lines to annotate with CSQ and flush
      rbuf_t vcf_rbuf;            // round buffer indexes to vcf_buf
      kh_pos2vbuf_t *pos2vbuf;    // fast lookup of buffered lines by position
-    tscript_t **rm_tr;          // buffer of transcripts to clean
+    gf_tscript_t **rm_tr;       // buffer of transcripts to clean
      int nrm_tr, mrm_tr;
      csq_t *csq_buf;             // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
      int ncsq_buf, mcsq_buf;
-    id_tbl_t tscript_ids;       // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
      int force;                  // force run under various conditions. Currently only to skip out-of-phase transcripts
      int n_threads;              // extra compression/decompression threads
  
@@ -647,818 +468,6 @@ const uint8_t cnt4[] =
  #define dna2aa(x)  gencode[  nt4[(uint8_t)(x)[0]]<<4 |  nt4[(uint8_t)(x)[1]]<<2 |  nt4[(uint8_t)(x)[2]] ]
  #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
  
-static const char *gf_strings_noncoding[] =
-{
-    "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
-    "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
-    "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
-    "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
-    "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
-    "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
-    "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
-    "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
-};
-static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
-static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
-
-const char *gf_type2gff_string(int type)
-{
-    if ( !GF_is_coding(type) )
-    {
-        if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
-        type &= (1<<(GF_coding_bit+1)) - 1;
-        return gf_strings_special[type - 1];
-    }
-    type &= (1<<GF_coding_bit) - 1;
-    return gf_strings_coding[type - 1];
-}
-
-/*
-    gff parsing functions
-*/
-static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
-{
-    aux_t *aux = &args->init;
-    char c = chr_end[1];
-    chr_end[1] = 0;
-    int iseq;
-    if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
-    {
-        // check for possible mismatch in chromosome naming convention such as chrX vs X
-        char *new_chr = NULL;
-        if ( faidx_has_seq(args->fai,chr_beg) )
-            new_chr = strdup(chr_beg);                  // valid chr name, the same in gff and faidx
-        else
-        {
-            int len = strlen(chr_beg);
-            if ( !strncmp("chr",chr_beg,3) && len>3 )
-                new_chr = strdup(chr_beg+3);            // gff has the prefix, faidx does not
-            else
-            {
-                new_chr = malloc(len+4);                // gff does not have the prefix, faidx has
-                memcpy(new_chr,"chr",3);
-                memcpy(new_chr+3,chr_beg,len);
-                new_chr[len+3] = 0;
-            }
-            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
-            {
-                static int unkwn_chr_warned = 0;
-                if ( !unkwn_chr_warned && args->verbosity>0 )
-                    fprintf(bcftools_stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
-                unkwn_chr_warned = 1;
-                free(new_chr);
-                new_chr = strdup(chr_beg);              // use the original sequence name
-            }
-        }
-        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
-        {
-            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
-            aux->seq[aux->nseq] = new_chr;
-            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
-            aux->nseq++;
-            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
-        }
-        else
-            free(new_chr);
-    }
-    chr_end[1] = c;
-    return iseq;
-}
-static inline char *gff_skip(const char *line, char *ss)
-{
-    while ( *ss && *ss!='\t' ) ss++;
-    if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    return ss+1;
-}
-static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
-{
-    char *se = (char*) line;
-    while ( *se && *se!='\t' ) se++;
-    if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    *chr_beg = (char*) line;
-    *chr_end = se-1;
-}
-static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
-{
-    char *se = ss;
-    *beg = strtol(ss, &se, 10) - 1;
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
-    ss = se+1;
-    *end = strtol(ss, &se, 10) - 1;
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    return se+1;
-}
-static void gff_id_init(id_tbl_t *tbl)
-{
-    memset(tbl, 0, sizeof(*tbl));
-    tbl->str2id = khash_str2int_init();
-}
-static void gff_id_destroy(id_tbl_t *tbl)
-{
-    khash_str2int_destroy_free(tbl->str2id);
-    free(tbl->str);
-}
-// returns 0 on success, -1 on failure
-static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
-{
-    ss = strstr(ss,needle);     // e.g. "ID=transcript:"
-    if ( !ss ) return -1;
-    ss += strlen(needle);
-
-    char *se = ss;
-    while ( *se && *se!=';' && !isspace(*se) ) se++;
-    char tmp = *se;
-    *se = 0;
-
-    int id;
-    if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
-    {
-        id = tbl->nstr++;
-        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
-        tbl->str[id] = strdup(ss);
-        khash_str2int_set(tbl->str2id, tbl->str[id], id);
-    }
-    *se = tmp;
-    *id_ptr = id;
-    return 0;
-}
-static inline int gff_parse_type(char *line)
-{
-    line = strstr(line,"ID=");
-    if ( !line ) return -1;
-    line += 3;
-    if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
-    else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
-    return -1;
-}
-static inline int gff_parse_biotype(char *_line)
-{
-    char *line = strstr(_line,"biotype=");
-    if ( !line ) return -1;
-
-    line += 8;
-    switch (*line)
-    {
-        case 'p':
-            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
-            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
-            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
-            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
-            break;
-        case 'a':
-            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
-            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
-            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
-            break;
-        case 'I':
-            if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
-            else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
-            else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
-            else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
-            else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
-            else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
-            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
-            break;
-        case 'T':
-            if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
-            else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
-            else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
-            else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
-            else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
-            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
-            break;
-        case 'M':
-            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
-            else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
-            break;
-        case 'l':
-            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
-            break;
-        case 'm':
-            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
-            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
-            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
-            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
-            break;
-        case 'r':
-            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
-            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
-            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
-            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
-            break;
-        case 's':
-            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
-            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
-            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
-            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
-            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
-            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
-            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
-            break;
-        case 't':
-            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
-            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
-            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
-            break;
-        case 'n':
-            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
-            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
-            break;
-        case 'k':
-            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
-            break;
-        case 'u':
-            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
-            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
-            break;
-        case 'L':
-            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
-            break;
-        case '3':
-            if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
-            break;
-        case 'd':
-            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
-            break;
-        case 'v':
-            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
-            break;
-        case 'b':
-            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
-            break;
-    }
-    return 0;
-}
-static inline int gff_ignored_biotype(args_t *args, char *ss)
-{
-    // Tally the value of the "biotype=" attribute found in ss in the table of ignored biotypes.
-    // Returns 0 when the attribute is absent, 1 after the occurrence has been counted.
-    char *value = strstr(ss,"biotype=");
-    if ( value==NULL ) return 0;
-    value += 8;     // step over "biotype="
-
-    // terminate the value temporarily at the next ';' (or at the end of the string)
-    char *end = value;
-    for (; *end!='\0' && *end!=';'; end++) ;
-    char saved = *end;
-    *end = '\0';
-
-    // a biotype seen for the first time is stored under its own copy of the string
-    int count = 0;
-    int is_new = khash_str2int_get(args->init.ignored_biotypes, value, &count) != 0;
-    khash_str2int_set(args->init.ignored_biotypes, is_new ? strdup(value) : value, count+1);
-
-    *end = saved;   // restore the caller's buffer
-    return 1;
-}
-gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
-{
-    // Return the gene record stored under gene_id in gid2gene; when there is none yet,
-    // insert a zero-filled record and return that.
-    khint_t iter = kh_get(int2gene, aux->gid2gene, (int)gene_id);
-    if ( iter != kh_end(aux->gid2gene) && kh_val(aux->gid2gene, iter) )
-        return kh_val(aux->gid2gene, iter);
-
-    gf_gene_t *created = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
-    int absent;
-    iter = kh_put(int2gene, aux->gid2gene, (int)gene_id, &absent);
-    kh_val(aux->gid2gene,iter) = created;
-    return created;
-}
-void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
-{
-    // Handle a transcript line: resolve its biotype, its transcript ID and the ID of its
-    // parent gene, then store a new tscript_t under the transcript ID in id2tr.
-    //   line .. the complete GFF line, used in messages only
-    //   ss   .. the part of the line that is searched for the attributes
-    //   ftr  .. strand and coordinates parsed by the caller
-    aux_t *aux = &args->init;
-
-    const int biotype = gff_parse_biotype(ss);
-    if ( biotype <= 0 )
-    {
-        // unsupported biotype: tally it; a line without any biotype= attribute is reported instead
-        const int tallied = gff_ignored_biotype(args, ss);
-        if ( !tallied && args->verbosity > 0 )
-            fprintf(bcftools_stderr,"ignored transcript, unknown biotype: %s\n",line);
-        return;
-    }
-
-    // transcript ID: the "ID=transcript:" notation is tried first, then a bare "ID="
-    uint32_t trid;
-    if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) != 0 )
-    {
-        if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) != 0 )
-            error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int id_warned = 0;       // the fallback is reported once only
-        if ( args->verbosity > 0 && !id_warned )
-        {
-            fprintf(bcftools_stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
-            id_warned = 1;
-        }
-    }
-
-    // parent gene ID: "Parent=gene:" is tried first, then a bare "Parent="
-    uint32_t gene_id;
-    if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) != 0 )
-    {
-        if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) != 0 )
-            error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int parent_warned = 0;   // the fallback is reported once only
-        if ( args->verbosity > 0 && !parent_warned )
-        {
-            fprintf(bcftools_stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
-            parent_warned = 1;
-        }
-    }
-
-    // build the transcript record; the gene record is created on demand by gene_init()
-    tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
-    tr->id     = trid;
-    tr->type   = biotype;
-    tr->gene   = gene_init(aux, gene_id);
-    tr->strand = ftr->strand;
-    tr->beg    = ftr->beg;
-    tr->end    = ftr->end;
-
-    // make it retrievable by transcript ID
-    int absent;
-    khint_t slot = kh_put(int2tscript, aux->id2tr, (int)trid, &absent);
-    kh_val(aux->id2tr,slot) = tr;
-}
-void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
-{
-    // Handle a gene line: resolve its biotype and gene ID, then fill in the sequence index
-    // and the name of the gene record. Lines with an unsupported biotype return early and
-    // leave the record untouched.
-    //   line            .. the complete GFF line, used in messages only
-    //   ss              .. the part of the line that is searched for the attributes
-    //   chr_beg,chr_end .. first and last character of the sequence name
-    const int biotype = gff_parse_biotype(ss);
-    if ( biotype <= 0 )
-    {
-        // unsupported biotype: tally it; a line without any biotype= attribute is reported instead
-        const int tallied = gff_ignored_biotype(args, ss);
-        if ( !tallied && args->verbosity > 0 )
-            fprintf(bcftools_stderr,"ignored gene, unknown biotype: %s\n",line);
-        return;
-    }
-
-    aux_t *aux = &args->init;
-
-    // gene ID, e.g. "ID=gene:ENSG00000437963"; a bare "ID=" is accepted with a one-time warning
-    uint32_t gene_id;
-    if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) != 0 )
-    {
-        if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) != 0 )
-            error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int id_warned = 0;
-        if ( args->verbosity > 0 && !id_warned )
-        {
-            fprintf(bcftools_stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
-            id_warned = 1;
-        }
-    }
-
-    gf_gene_t *gene = gene_init(aux, gene_id);
-    assert( !gene->name );      // the gene_id should be unique
-
-    gene->iseq = feature_set_seq(args, chr_beg,chr_end);
-
-    // gene name, e.g. "Name=OR4F5", searched in the text that follows the sequence name
-    char *name = strstr(chr_end+2,"Name=");
-    if ( !name )
-    {
-        // Name=<GeneName> field is not present, use the gene ID instead
-        gene->name = strdup(aux->gene_ids.str[gene_id]);
-        return;
-    }
-
-    // the name runs up to the next ';', whitespace or the end of the string
-    name += 5;
-    char *stop = name;
-    while ( *stop && *stop!=';' && !isspace(*stop) ) stop++;
-    size_t len = stop - name;
-    gene->name = (char*) malloc(len+1);
-    memcpy(gene->name,name,len);
-    gene->name[len] = 0;
-}
-int gff_parse(args_t *args, char *line, ftr_t *ftr)
-{
-    // Parse one GFF line.
-    //
-    // Returns 0 when the line is an exon, CDS or UTR feature and `ftr` has been filled
-    // (type, beg, end, strand, phase, trid, iseq). Returns -1 for every other line: blank
-    // lines, comments, ignored features, and gene/transcript lines, which are consumed
-    // here via gff_parse_gene() / gff_parse_transcript().
-    //
-    // - skip empty lines and commented lines
-    // - columns
-    //      1.      chr
-    //      2.      <skip>
-    //      3.      CDS, transcript, gene, ...
-    //      4-5.    beg,end
-    //      6.      <skip>
-    //      7.      strand
-    //      8.      phase
-    //      9.      Parent=transcript:ENST(\d+);ID=... etc
-
-    char *ss = line;
-    if ( !*ss ) return -1;      // skip blank lines
-    if ( *ss=='#' ) return -1;  // skip comments
-
-    char *chr_beg, *chr_end;
-    gff_parse_chr(line, &chr_beg, &chr_end);
-    ss = gff_skip(line, chr_end + 2);
-
-    // 3. column: is this a CDS, transcript, gene, etc.
-    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
-    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
-    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
-    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
-    else
-    {
-        // Compare the whole column including the terminating tab. Comparing only the first
-        // four characters would classify any type that merely starts with "gene" or "tran"
-        // (e.g. "transposable_element") as a gene or transcript line.
-        int type = GFF_UNKN_LINE;
-        if ( !strncmp("gene\t",ss,5) ) type = GFF_GENE_LINE;
-        else if ( !strncmp("transcript\t",ss,11) ) type = GFF_TSCRIPT_LINE;
-        ss = gff_skip(line, ss);
-        ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
-        ss = gff_skip(line, ss);
-        if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss);   // determine type from ID=transcript: or ID=gene:
-        if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
-        {
-            // we ignore these, debug print to see new types:
-            ss = strstr(ss,"ID=");
-            if ( !ss ) return -1;   // no ID, ignore the line
-            if ( !strncmp("chromosome",ss+3,10) ) return -1;
-            if ( !strncmp("supercontig",ss+3,11) ) return -1;
-            if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored: %s\n", line);
-            return -1;
-        }
-
-        // 7. column: strand. Unlike for exon/CDS/UTR lines below, an unknown strand is fatal here
-        if ( *ss == '+' ) ftr->strand = STRAND_FWD;
-        else if ( *ss == '-' ) ftr->strand = STRAND_REV;
-        else error("Unknown strand: %c .. %s\n", *ss,ss);
-
-        if ( type==GFF_TSCRIPT_LINE )
-            gff_parse_transcript(args, line, ss, ftr);
-        else
-            gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
-
-        // gene and transcript lines are stored by the calls above, not returned as features
-        return -1;
-    }
-    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
-    ss = gff_skip(line, ss);
-
-    // 7. column: strand
-    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
-    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
-    else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
-    ss += 2;    // one character plus the column separator
-
-    // 8. column: phase (codon offset)
-    if ( *ss == '0' ) ftr->phase = 0;
-    else if ( *ss == '1' ) ftr->phase = 1;
-    else if ( *ss == '2' ) ftr->phase = 2;
-    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
-    else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
-    ss += 2;    // one character plus the column separator
-
-    // substring search for "Parent=transcript:ENST00000437963"; a bare "Parent=" is
-    // accepted as a fallback with a one-time warning
-    if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
-    {
-        if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
-            error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-        static int warned = 0;
-        if ( !warned && args->verbosity > 0 )
-        {
-            fprintf(bcftools_stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
-            warned = 1;
-        }
-    }
-
-    ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
-    return 0;
-}
-
-static int cmp_cds_ptr(const void *a, const void *b)
-{
-    // qsort callback: orders the elements of an array of gf_cds_t pointers by ascending beg
-    const gf_cds_t *lhs = *((gf_cds_t**)a);
-    const gf_cds_t *rhs = *((gf_cds_t**)b);
-    return (lhs->beg > rhs->beg) - (lhs->beg < rhs->beg);
-}
-
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
-{
-    // Set *chr_beg to the first and *chr_end to the last character of the name aux->seq[iseq]
-    char *name = aux->seq[iseq];
-    char *last = name;
-    while ( last[1] ) last++;
-    *chr_beg = name;
-    *chr_end = last;
-}
-tscript_t *tscript_init(aux_t *aux, uint32_t trid)
-{
-    // Fetch the transcript stored under trid in id2tr; the transcript is required to exist
-    tscript_t *tr = NULL;
-    khint_t iter = kh_get(int2tscript, aux->id2tr, (int)trid);
-    if ( iter != kh_end(aux->id2tr) ) tr = kh_val(aux->id2tr, iter);
-    assert( tr );
-    return tr;
-}
-void register_cds(args_t *args, ftr_t *ftr)
-{
-    // Append the CDS described by ftr (the result of parsing a gff CDS line) to the cds
-    // array of its transcript. The CDS is not pushed into idx_cds here.
-    tscript_t *tr = tscript_init(&args->init, ftr->trid);
-    if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
-    gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
-    cds->icds  = 0;     // to keep valgrind on mac happy
-    cds->tr    = tr;
-    cds->phase = ftr->phase;
-    cds->beg   = ftr->beg;
-    cds->len   = ftr->end - ftr->beg + 1;
-
-    // grow the array when needed, then store the new element at the end
-    hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
-    tr->cds[tr->ncds] = cds;
-    tr->ncds++;
-}
-void register_utr(args_t *args, ftr_t *ftr)
-{
-    // Create a UTR record (3' or 5', depending on ftr->type) for the transcript ftr->trid
-    // and push it into idx_utr under the sequence name of the transcript's gene.
-    gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
-    if ( ftr->type==GF_UTR3 )
-        utr->which = prime3;
-    else
-        utr->which = prime5;
-    utr->beg = ftr->beg;
-    utr->end = ftr->end;
-    utr->tr  = tscript_init(&args->init, ftr->trid);
-
-    char *name_beg, *name_end;
-    chr_beg_end(&args->init, utr->tr->gene->iseq, &name_beg, &name_end);
-    regidx_push(args->idx_utr, name_beg,name_end, utr->beg,utr->end, &utr);
-}
-void register_exon(args_t *args, ftr_t *ftr)
-{
-    // Create an exon record for the transcript ftr->trid and push it into idx_exon under the
-    // sequence name of the transcript's gene. The indexed interval is the exon widened by
-    // N_SPLICE_REGION_INTRON on both sides.
-    gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
-    exon->beg = ftr->beg;
-    exon->end = ftr->end;
-    exon->tr  = tscript_init(&args->init, ftr->trid);
-
-    char *name_beg, *name_end;
-    chr_beg_end(&args->init, exon->tr->gene->iseq, &name_beg, &name_end);
-    regidx_push(args->idx_exon, name_beg,name_end,
-                exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
-}
-
-void tscript_init_cds(args_t *args)
-{
-    // For every transcript in id2tr: push the transcript into idx_tscript and, when it has
-    // CDS, sort them by position, trim the leading phase and an incomplete trailing codon,
-    // verify the phase column and that the CDS do not overlap, set the CDS offsets and
-    // push the CDS into idx_cds.
-    // A phase inconsistency or a CDS overlap is fatal unless args->force is set; with
-    // args->force a transcript with inconsistent phase is skipped (its CDS are not indexed).
-    aux_t *aux = &args->init;
-
-    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
-    khint_t k;
-    int warn_phase_unkn = 0;
-    for (k=0; k<kh_end(aux->id2tr); k++)
-    {
-        if ( !kh_exist(aux->id2tr, k) ) continue;
-        tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
-
-        // position-to-tscript lookup
-        char *chr_beg, *chr_end;
-        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
-        regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
-
-        if ( !tr->ncds ) continue;      // transcript with no CDS
-
-        // sort CDS by their start coordinate
-        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
-
-        // trim non-coding start
-        int i, len = 0;
-        if ( tr->strand==STRAND_FWD )
-        {
-            // forward strand: translation starts in the first (leftmost) CDS
-            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
-            {
-                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
-                tr->cds[0]->beg += tr->cds[0]->phase;
-                tr->cds[0]->len -= tr->cds[0]->phase;
-                tr->cds[0]->phase = 0;
-            }
-
-            // sanity check phase; the phase number in gff tells us how many bases to skip in this
-            // feature to reach the first base of the next codon
-            int tscript_ok = 1;
-            for (i=0; i<tr->ncds; i++)
-            {
-                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
-                {
-                    warn_phase_unkn = 1;
-                    len += tr->cds[i]->len;
-                    continue;
-                }
-                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
-                if ( phase!=len%3 )
-                {
-                    if ( args->force )
-                    {
-                        if ( args->verbosity > 0 )
-                            fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
-                                args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                        tscript_ok = 0;
-                        break;
-                    }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
-                            args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                }
-                len += tr->cds[i]->len;
-            }
-            if ( !tscript_ok ) continue;    // skip this transcript
-        }
-        else
-        {
-            // reverse strand: translation starts in the last (rightmost) CDS
-            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
-            {
-                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
-                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
-                // todo: the same for the fwd strand
-                i = tr->ncds - 1;
-                int phase = tr->cds[i]->phase;
-                if ( phase ) tr->trim |= TRIM_5PRIME;
-                while ( i>=0 && phase > tr->cds[i]->len )
-                {
-                    phase -= tr->cds[i]->len;
-                    tr->cds[i]->phase = 0;
-                    tr->cds[i]->len   = 0;
-                    i--;
-                }
-                // The loop above can run past the first CDS when the phase exceeds the total
-                // CDS length; in that case there is nothing left to trim and tr->cds[-1] must
-                // not be touched.
-                if ( i>=0 )
-                {
-                    tr->cds[i]->len  -= tr->cds[i]->phase;
-                    tr->cds[i]->phase = 0;
-                }
-            }
-
-            // sanity check phase
-            int tscript_ok = 1;
-            for (i=tr->ncds-1; i>=0; i--)
-            {
-                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
-                {
-                    warn_phase_unkn = 1;
-                    len += tr->cds[i]->len;
-                    continue;
-                }
-                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
-                if ( phase!=len%3)
-                {
-                    if ( args->force )
-                    {
-                        if ( args->verbosity > 0 )
-                            fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
-                                args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                        tscript_ok = 0;
-                        break;
-                    }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
-                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
-                }
-                len += tr->cds[i]->len;
-            }
-            if ( !tscript_ok ) continue;    // skip this transcript
-        }
-
-        // set len. At the same time check that CDS within a transcript do not overlap
-        len = 0;
-        for (i=0; i<tr->ncds; i++)
-        {
-            tr->cds[i]->icds = i;
-            len += tr->cds[i]->len;
-            if ( !i ) continue;
-
-            gf_cds_t *a = tr->cds[i-1];
-            gf_cds_t *b = tr->cds[i];
-            if ( a->beg + a->len - 1 >= b->beg )
-            {
-                if ( args->force )
-                {
-                    fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
-                        args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
-                }
-                else
-                    error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
-                          "       Use the --force option to override (at your own risk).\n",
-                            args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
-            }
-        }
-        if ( len%3 != 0 )
-        {
-            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
-            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
-            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
-
-            tr->trim |= TRIM_3PRIME;
-            if ( tr->strand==STRAND_FWD )
-            {
-                // shorten from the right end, moving to earlier CDS while bases remain to be removed
-                i = tr->ncds - 1;
-                while ( i>=0 && len%3 )
-                {
-                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
-                    tr->cds[i]->len -= dlen;
-                    len -= dlen;
-                    i--;
-                }
-            }
-            else
-            {
-                // shorten from the left end, which also moves the CDS start
-                i = 0;
-                while ( i<tr->ncds && len%3 )
-                {
-                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
-                    tr->cds[i]->len -= dlen;
-                    tr->cds[i]->beg += dlen;
-                    len -= dlen;
-                    i++;
-                }
-            }
-        }
-
-        // set CDS offsets and insert into regidx
-        len=0;
-        for (i=0; i<tr->ncds; i++)
-        {
-            tr->cds[i]->pos = len;
-            len += tr->cds[i]->len;
-            regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
-        }
-    }
-    if ( warn_phase_unkn && args->verbosity > 0 )
-        fprintf(bcftools_stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
-}
-
-void regidx_free_gf(void *payload)
-{
-    // payload is the address of a stored pointer; release the object that pointer refers to
-    gf_cds_t **slot = (gf_cds_t**) payload;
-    free(*slot);
-}
-void regidx_free_tscript(void *payload)
-{
-    // payload is the address of a stored tscript_t pointer; release the transcript's cds
-    // array (not the CDS records it points to) and then the transcript itself
-    tscript_t **slot = (tscript_t**) payload;
-    tscript_t *tr = *slot;
-    free(tr->cds);
-    free(tr);
-}
-
-void init_gff(args_t *args)
-{
-    // Read the GFF file args->gff_fname and build the region indexes used for lookups:
-    // idx_tscript, idx_cds, idx_utr and idx_exon. The parsing-only structures are released
-    // before returning, with the exception of gid2gene, which is kept so that the genes
-    // can be destroyed at the end.
-    aux_t *aux = &args->init;
-    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
-    aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
-    aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
-    args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
-    aux->ignored_biotypes = khash_str2int_init();
-    gff_id_init(&aux->gene_ids);
-    gff_id_init(&args->tscript_ids);
-
-    // parse gff
-    kstring_t str = {0,0,0};
-    htsFile *fp = hts_open(args->gff_fname,"r");
-    if ( !fp ) error("Failed to read %s\n", args->gff_fname);
-    // hts_getline() returns the line length, which is 0 for a blank line; the loop must
-    // therefore continue on 0, otherwise everything after the first blank line would be
-    // silently dropped. Blank lines are skipped by gff_parse().
-    while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
-    {
-        // make room for one more feature; the slot is kept only when gff_parse() returns 0
-        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
-        int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
-        if ( !ret ) aux->nftr++;
-    }
-    free(str.s);
-    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
-
-
-    // process gff information: connect CDS and exons to transcripts
-    args->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
-    args->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
-    args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
-    args->itr      = regitr_init(NULL);
-
-    int i;
-    for (i=0; i<aux->nftr; i++)
-    {
-        ftr_t *ftr = &aux->ftr[i];
-
-        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
-        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
-        if ( k==kh_end(aux->id2tr) ) continue;       // no such transcript
-
-        tscript_t *tr = kh_val(aux->id2tr,k);
-        if ( !tr->gene->name )
-        {
-            // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
-            regidx_free_tscript(&tr);
-            kh_del(int2tscript, aux->id2tr,k);
-            continue;
-        }
-
-        // populate regidx by category:
-        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
-        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
-        if ( ftr->type==GF_CDS ) register_cds(args, ftr);
-        else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
-        else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
-        else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
-        else
-            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
-    }
-    // sort and validate the CDS of each transcript, fill idx_tscript and idx_cds
-    tscript_init_cds(args);
-
-    if ( args->verbosity > 0 )
-    {
-        fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
-                regidx_nregs(args->idx_tscript),
-                regidx_nregs(args->idx_exon),
-                regidx_nregs(args->idx_cds),
-                regidx_nregs(args->idx_utr));
-    }
-    if ( !regidx_nregs(args->idx_tscript) )
-        fprintf(bcftools_stderr,
-            "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
-            "         or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
-            "         of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
-
-    // release the structures needed only while parsing
-    free(aux->ftr);
-    khash_str2int_destroy_free(aux->seq2int);
-    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
-    kh_destroy(int2tscript,aux->id2tr);
-    free(aux->seq);
-    gff_id_destroy(&aux->gene_ids);
-
-    // report how many times each unsupported biotype was seen
-    if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
-    {
-        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
-        fprintf(bcftools_stderr,"Ignored the following biotypes:\n");
-        for (i = kh_begin(ign); i < kh_end(ign); i++)
-        {
-            if ( !kh_exist(ign,i)) continue;
-            const char *biotype = kh_key(ign,i);
-            if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
-            fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
-        }
-    }
-    khash_str2int_destroy_free(aux->ignored_biotypes);
-}
-
  static inline int ncsq2_to_nfmt(int ncsq2)
  {
      return 1 + (ncsq2 - 1) / 30;
@@ -1476,8 +485,17 @@ void init_data(args_t *args)
      args->fai = fai_load(args->fa_fname);
      if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
  
-    if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname);
-    init_gff(args);
+    args->gff = gff_init(args->gff_fname);
+    gff_set(args->gff,verbosity,args->verbosity);
+    gff_set(args->gff,strip_chr_names,args->unify_chr_names);
+    gff_set(args->gff,force_out_of_phase,args->force);
+    gff_set(args->gff,dump_fname,args->dump_gff);
+    gff_parse(args->gff);
+    args->idx_cds  = gff_get(args->gff,idx_cds);
+    args->idx_utr  = gff_get(args->gff,idx_utr);
+    args->idx_exon = gff_get(args->gff,idx_exon);
+    args->idx_tscript = gff_get(args->gff,idx_tscript);
+    args->itr = regitr_init(NULL);
  
      args->rid = -1;
  
@@ -1538,6 +556,7 @@ void init_data(args_t *args)
          if ( args->hdr_nsmpl )
              bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
          if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      }
      if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n");
  }
@@ -1549,21 +568,8 @@ void destroy_data(args_t *args)
              "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
              "      the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
  
-    regidx_destroy(args->idx_cds);
-    regidx_destroy(args->idx_utr);
-    regidx_destroy(args->idx_exon);
-    regidx_destroy(args->idx_tscript);
      regitr_destroy(args->itr);
-
-    khint_t k,i,j;
-    for (k=0; k<kh_end(args->init.gid2gene); k++)
-    {
-        if ( !kh_exist(args->init.gid2gene, k) ) continue;
-        gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
-        free(gene->name);
-        free(gene);
-    }
-    kh_destroy(int2gene,args->init.gid2gene);
+    gff_destroy(args->gff);
  
      if ( args->filter )
          filter_destroy(args->filter);
@@ -1571,9 +577,20 @@ void destroy_data(args_t *args)
      khp_destroy(trhp,args->active_tr);
      kh_destroy(pos2vbuf,args->pos2vbuf);
      if ( args->smpl ) smpl_ilist_destroy(args->smpl);
-    int ret;
+    int i,j,ret;
      if ( args->out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
          ret = hts_close(args->out_fh);
+    }
      else
          ret = fclose(args->out);
      if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
@@ -1604,7 +621,7 @@ void destroy_data(args_t *args)
      free(args->gt_arr);
      free(args->str.s);
      free(args->str2.s);
-    gff_id_destroy(&args->tscript_ids);
+    free(args->chr_name);
  }
  
  /*
@@ -1616,7 +633,7 @@ void destroy_data(args_t *args)
  #define SPLICE_OVERLAP 3   // indel overlaps region boundary, csq set but could not determine csq
  typedef struct
  {
-    tscript_t *tr;
+    gf_tscript_t *tr;
      struct {
          int32_t pos, rlen, alen, ial;
          char *ref, *alt;
@@ -1680,7 +697,7 @@ fprintf(bcftools_stderr,"build_hap:  rbeg=%d + %d    abeg=%d \n",rbeg,rlen,abeg)
      if ( rbeg < splice->vcf.pos )
      {
          assert( splice->tr->beg <= rbeg );  // this can be extended thanks to N_REF_PAD
-        kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
          roff = 0;
      }
      else
@@ -1705,7 +722,7 @@ fprintf(bcftools_stderr,"r2: %s\n",splice->kref.s);
          if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
              rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
          if ( splice->kref.l < rlen )
-            kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+            kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
      }
  #if XDBG
  fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s);
@@ -1716,7 +733,7 @@ fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s);
      if ( abeg < splice->vcf.pos )
      {
          assert( splice->tr->beg <= abeg );
-        kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
          aoff = 0;
      }
      else
@@ -1744,7 +761,7 @@ fprintf(bcftools_stderr,"a2: %s  aoff=%d\n",splice->kalt.s,aoff);
          if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
              alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
          if ( alen > 0 && alen > splice->kalt.l )
-            kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+            kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
      }
  #if XDBG
  fprintf(bcftools_stderr,"a3: %s\n",splice->kalt.s);
@@ -1757,7 +774,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
      while ( regitr_overlap(itr) )
      {
          gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
-        tscript_t *tr = utr->tr;
+        gf_tscript_t *tr = utr->tr;
          if ( tr->id != trid ) continue;
          csq_t csq;
          memset(&csq, 0, sizeof(csq_t));
@@ -1773,7 +790,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
      }
      return 0;
  }
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
  {
  #if XDBG
  fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
@@ -1790,6 +807,21 @@ fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
      csq.type.gene    = tr->gene->name;
      csq_stage(args, &csq, rec);
  }
+static inline const char *drop_chr_prefix(args_t *args, const char *chr)
+{
+    // When args->unify_chr_names is set and chr starts with "chr" (compared case-insensitively),
+    // return the name past that prefix; in every other case return chr as is.
+    int has_prefix = args->unify_chr_names && strncasecmp("chr",chr,3)==0;
+    return has_prefix ? chr+3 : chr;
+}
+static inline const char *add_chr_prefix(args_t *args, const char *chr)
+{
+    // When args->unify_chr_names is set, write "chr" followed by chr into the buffer
+    // args->chr_name (grown as needed) and return that buffer, so the result is overwritten
+    // by the next call. Otherwise return chr as is.
+    if ( args->unify_chr_names )
+    {
+        int name_len = strlen(chr);
+        hts_expand(char,name_len+4,args->mchr_name,args->chr_name);     // "chr" + name + NUL
+        char *dst = args->chr_name;
+        memcpy(dst,"chr",3);
+        memcpy(dst+3,chr,name_len+1);   // the +1 copies the terminating NUL as well
+        return dst;
+    }
+    return chr;
+}
  static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
  {
      // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
@@ -1815,7 +847,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d
          if ( splice->check_utr )
          {
              regitr_t *itr = regitr_init(NULL);
-            const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
              if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) )     // adjacent utr
              {
                  ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1853,7 +885,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d
          if ( splice->check_utr )
          {
              regitr_t *itr = regitr_init(NULL);
-            const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
              if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) )     // adjacent utr
              {
                  ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
@@ -1926,7 +958,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d
  int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
  {
      static int small_ref_padding_warned = 0;
-    tscript_t *tr = splice->tr;
+    gf_tscript_t *tr = splice->tr;
  
      // We know the VCF record overlaps the exon, but does it overlap the start codon?
      if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
@@ -1958,7 +990,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
          }
  
          char *ptr_vcf = splice->vcf.ref + alt_len;                         // the first deleted base in the VCF REF allele
-        char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
+        char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);  // the first ref base after the ndel bases deleted
  #if XDBG
          fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
  #endif
@@ -1987,7 +1019,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
          }
  
          char *ptr_vcf = splice->vcf.ref + alt_len;                                      // the first deleted base in the VCF REF allele
-        char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
+        char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;  // the replacement ref block
  #if XDBG
          fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
  #endif
@@ -2032,7 +1064,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2088,7 +1120,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2177,7 +1209,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2207,7 +1239,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d
              if ( splice->check_utr )
              {
                  regitr_t *itr = regitr_init(NULL);
-                const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                  if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )     // adjacent utr
                      csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                  regitr_destroy(itr);
@@ -2293,7 +1325,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
  {
      int i;
      kstring_t str = {0,0,0};
-    tscript_t *tr = cds->tr;
+    gf_tscript_t *tr = cds->tr;
      child->icds = cds->icds;     // index of cds in the tscript's list of exons
      child->vcf_ial = ial;
  
@@ -2315,8 +1347,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
      }
      if ( splice.check_start )   // do not check starts in incomplete CDS, defined as not starting with M
      {
-        if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
-        else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+        if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+        else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
      }
      if ( child->icds!=0 ) splice.check_region_beg = 1;
      if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -2375,12 +1407,12 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c
              // the variant is on a new exon, finish up the previous
              int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
              if ( len > 0 )
-                kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+                kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
          }
  
          // append any skipped non-variant exons
          while ( ++i < cds->icds )
-            kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
  
          if ( parent->icds==child->icds )
          {
@@ -2392,10 +1424,10 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c
                  free(splice.kalt.s);
                  return 1;
              }
-            kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
          }
          else
-            kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
      }
      kputs(splice.kalt.s + dbeg, &str);
  
@@ -2647,28 +1679,28 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,r
  #endif
  }
  
-void tscript_splice_ref(tscript_t *tr)
+void tscript_splice_ref(gf_tscript_t *tr)
  {
      int i, len = 0;
      for (i=0; i<tr->ncds; i++)
          len += tr->cds[i]->len;
  
-    tr->nsref = len + 2*N_REF_PAD;
-    tr->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
+    TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD;
+    TSCRIPT_AUX(tr)->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
      len = 0;
  
-    memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+    memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
      len += N_REF_PAD;
  
      for (i=0; i<tr->ncds; i++)
      {
-        memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+        memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
          len += tr->cds[i]->len;
      }
-    memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+    memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
      len += N_REF_PAD;
  
-    tr->sref[len] = 0;
+    TSCRIPT_AUX(tr)->sref[len] = 0;
  }
  
  // returns: 0 if consequence was added, 1 if it already exists or could not be added
@@ -2802,18 +1834,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
      if ( csq->type & CSQ_UPSTREAM_STOP )
          kputc_('*',str);
  
-    int i, n = sizeof(csq_strings)/sizeof(char*);
+    int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*);
      for (i=1; i<n; i++)
-        if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+        if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputs(csq_strings[i],str); break; }
      i++;
      for (; i<n; i++)
-        if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+        if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputc_('&',str); kputs(csq_strings[i],str); }
+
+    if ( (csq->biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) )
+    {
+        if ( has_csq ) kputc_('&',str); // just in case, this should always be true
+        kputs("NMD_transcript",str);
+    }
  
      kputc_('|', str);
      if ( csq->gene ) kputs(csq->gene , str);
  
      kputc_('|', str);
-    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+//    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);
  
      kputc_('|', str);
      kputs(gf_type2gff_string(csq->biotype), str);
@@ -2842,7 +1881,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
  void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
  {
      int i;
-    tscript_t *tr = hap->tr;
+    gf_tscript_t *tr = hap->tr;
      int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
      int icsq = node->ncsq_list++;
      hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2956,7 +1995,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
      str.l = 0;
  
      // create the aa variant string
-    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
      int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
      kputc_('|', &str);
      kputw(aa_rbeg, &str);
@@ -3022,13 +2061,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
  
  void hap_finalize(args_t *args, hap_t *hap)
  {
-    tscript_t *tr = hap->tr;
-    if ( !tr->sref )
+    gf_tscript_t *tr = hap->tr;
+    if ( !TSCRIPT_AUX(tr)->sref )
          tscript_splice_ref(tr);
  
      kstring_t sref;
-    sref.s = tr->sref;
-    sref.l = tr->nsref;
+    sref.s = TSCRIPT_AUX(tr)->sref;
+    sref.l = TSCRIPT_AUX(tr)->nsref;
      sref.m = sref.l;
  
      int istack = 0;
@@ -3036,7 +2075,7 @@ void hap_finalize(args_t *args, hap_t *hap)
  
      hap->sseq.l = 0;
      hap->tseq.l = 0;
-    hap->stack[0].node = tr->root;
+    hap->stack[0].node = TSCRIPT_AUX(tr)->root;
      hap->stack[0].ichild = -1;
      hap->stack[0].slen = 0;
      hap->stack[0].dlen = 0;
@@ -3216,7 +2255,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
      kput_vcsq(args, &csq->type, &args->str);
      fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
  }
-static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
  {
      if ( !node || !node->ncsq_list ) return;
  
@@ -3242,7 +2281,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
      }
  }
  
-static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
  {
      if ( !node || !node->ncsq_list || ismpl<0 ) return;
  
@@ -3278,23 +2317,23 @@ void hap_flush(args_t *args, uint32_t pos)
      tr_heap_t *heap = args->active_tr;
      while ( heap->ndat && heap->dat[0]->end<=pos )
      {
-        tscript_t *tr = heap->dat[0];
+        gf_tscript_t *tr = heap->dat[0];
          khp_delete(trhp, heap);
          args->hap->tr = tr;
-        if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+        if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling
          {
              hap_finalize(args, args->hap);
  
              if ( args->output_type==FT_TAB_TEXT )   // plain text output, not a vcf
              {
                  if ( args->phase==PHASE_DROP_GT )
-                    hap_print_text(args, tr, -1,0, tr->hap[0]);
+                    hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
                  else
                  {
                      for (i=0; i<args->smpl->n; i++)
                      {
                          for (j=0; j<2; j++)
-                            hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+                            hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]);
                      }
                  }
              }
@@ -3303,7 +2342,7 @@ void hap_flush(args_t *args, uint32_t pos)
                  for (i=0; i<args->smpl->n; i++)
                  {
                      for (j=0; j<2; j++)
-                        hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+                        hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]);
                  }
              }
          }
@@ -3311,7 +2350,7 @@ void hap_flush(args_t *args, uint32_t pos)
          // mark the transcript for deletion. Cannot delete it immediately because
          // by-position VCF output will need them when flushed by vcf_buf_push
          args->nrm_tr++;
-        hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+        hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
          args->rm_tr[args->nrm_tr-1] = tr;
      }
  }
@@ -3426,24 +2465,33 @@ void vbuf_flush(args_t *args, uint32_t pos)
  
      for (i=0; i<args->nrm_tr; i++)
      {
-        tscript_t *tr = args->rm_tr[i];
-        if ( tr->root ) hap_destroy(tr->root);
-        tr->root = NULL;
-        free(tr->hap);
-        free(tr->ref);
-        free(tr->sref);
+        gf_tscript_t *tr = args->rm_tr[i];
+        tscript_t *aux = TSCRIPT_AUX(tr);
+        if ( aux->root ) hap_destroy(aux->root);
+        aux->root = NULL;
+        free(aux->hap);
+        free(aux->ref);
+        free(aux->sref);
+        free(aux);
+        tr->aux = NULL;
      }
      args->nrm_tr = 0;
      args->ncsq_buf = 0;
  }
  
-void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
  {
      int i, len;
      int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
  
-    tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
-    if ( !tr->ref )
+    const char *tmp_chr = chr;
+    if ( !faidx_has_seq(args->fai,tmp_chr) )
+    {
+        tmp_chr = drop_chr_prefix(args,chr);
+        if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
+    }
+    TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+    if ( !TSCRIPT_AUX(tr)->ref )
          error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
  
      int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
@@ -3451,23 +2499,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
      {
          char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
          for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
-        memcpy(ref+i, tr->ref, len);
+        memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len);
          len += i;
          for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
          ref[i+len] = 0;
-        free(tr->ref);
-        tr->ref = ref;
+        free(TSCRIPT_AUX(tr)->ref);
+        TSCRIPT_AUX(tr)->ref = ref;
      }
  }
  
-static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
  {
      int vbeg = 0;
      int rbeg = rec->pos - tr->beg + N_REF_PAD;
      if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
-    char *ref = tr->ref + rbeg;
+    char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
      char *vcf = rec->d.allele[0] + vbeg;
-    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
+    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );
      int i = 0;
      while ( ref[i] && vcf[i] )
      {
@@ -3481,7 +2529,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
  int test_cds_local(args_t *args, bcf1_t *rec)
  {
      int i,j, ret = 0;
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      // note that the off-by-one extension of rlen is deliberate to account for insertions
      if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
  
@@ -3493,12 +2541,13 @@ int test_cds_local(args_t *args, bcf1_t *rec)
      while ( regitr_overlap(args->itr) )
      {
          gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
-        tscript_t *tr = cds->tr;
+        gf_tscript_t *tr = cds->tr;
          if ( !GF_is_coding(tr->type) ) continue;
          ret = 1;
  
-        if ( !tr->ref )
+        if ( !TSCRIPT_AUX(tr) )
          {
+            tr->aux = calloc(sizeof(tscript_t),1);
              tscript_init_ref(args, tr, chr);
              tscript_splice_ref(tr);
              khp_insert(trhp, args->active_tr, &tr);     // only to clean the reference afterwards
@@ -3507,8 +2556,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
          sanity_check_ref(args, tr, rec);
  
          kstring_t sref;
-        sref.s = tr->sref;
-        sref.l = tr->nsref;
+        sref.s = TSCRIPT_AUX(tr)->sref;
+        sref.l = TSCRIPT_AUX(tr)->nsref;
          sref.m = sref.l;
  
          for (i=1; i<rec->n_allele; i++)
@@ -3616,8 +2665,8 @@ int test_cds_local(args_t *args, bcf1_t *rec)
                  {
                      // create the aa variant string
                      kstring_t str = {0,0,0};
-                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
-                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
                      kputc_('|', &str);
                      kputw(aa_rbeg, &str);
                      kprint_aa_prediction(args,aa_rbeg,tref,&str);
@@ -3635,11 +2684,11 @@ int test_cds_local(args_t *args, bcf1_t *rec)
                      csq_stage(args, &csq, rec);
  
                      // all this only to clean vstr when vrec is flushed
-                    if ( !tr->root )
-                        tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
-                    tr->root->ncsq_list++;
-                    hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
-                    csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+                    if ( !TSCRIPT_AUX(tr)->root )
+                        TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+                    TSCRIPT_AUX(tr)->root->ncsq_list++;
+                    hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
+                    csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
                      rm_csq->type.vstr = str;
                  }
                  if ( csq_type & ~CSQ_COMPOUND )
@@ -3661,27 +2710,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
      static int overlaps_warned = 0, multiploid_warned = 0;
  
      int i, ret = 0, hap_ret;
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      // note that the off-by-one extension of rlen is deliberate to account for insertions
      if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
      while ( regitr_overlap(args->itr) )
      {
          gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
-        tscript_t *tr = cds->tr;
+        gf_tscript_t *tr = cds->tr;
          if ( !GF_is_coding(tr->type) ) continue;
          if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
          ret = 1;
-        if ( !tr->root )
+        if ( !TSCRIPT_AUX(tr) )
          {
              // initialize the transcript and its haplotype tree, fetch the reference sequence
+            tr->aux = calloc(sizeof(tscript_t),1);
              tscript_init_ref(args, tr, chr);
  
-            tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
-            tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
-            tr->hap  = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
-            for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
-            tr->root->nend = tr->nhap;
-            tr->root->type = HAP_ROOT;
+            TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+            TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;     // maximum ploidy = diploid
+            TSCRIPT_AUX(tr)->hap  = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
+            for (i=0; i<TSCRIPT_AUX(tr)->nhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
+            TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;
+            TSCRIPT_AUX(tr)->root->type = HAP_ROOT;
  
              khp_insert(trhp, args->active_tr, &tr);
          }
@@ -3691,7 +2741,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
          if ( args->phase==PHASE_DROP_GT )
          {
              if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
-            hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+            hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
              hap_node_t *child  = (hap_node_t*)calloc(1,sizeof(hap_node_t));
              hap_ret = hap_init(args, parent, child, cds, rec, 1);
              if ( hap_ret!=0 )
@@ -3736,8 +2786,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
              parent->mchild = 1;
              parent->child  = (hap_node_t**) malloc(sizeof(hap_node_t*));
              parent->child[0] = child;
-            tr->hap[0] = child;
-            tr->hap[0]->nend = 1;
+            TSCRIPT_AUX(tr)->hap[0] = child;
+            TSCRIPT_AUX(tr)->hap[0]->nend = 1;
              continue;
          }
  
@@ -3795,12 +2845,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
                  assert( ial < rec->n_allele );
                  if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
  
-                hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+                hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
                  if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
                  {
                      // this haplotype has been seen in another sample
-                    tr->hap[i] = parent->child[ parent->cur_child[ial] ];
-                    tr->hap[i]->nend++;
+                    TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
+                    TSCRIPT_AUX(tr)->hap[i]->nend++;
                      parent->nend--;
                      continue;
                  }
@@ -3854,8 +2904,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
                  hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
                  parent->cur_child[ial] = j;
                  parent->child[j] = child;
-                tr->hap[i] = child;
-                tr->hap[i]->nend++;
+                TSCRIPT_AUX(tr)->hap[i] = child;
+                TSCRIPT_AUX(tr)->hap[i]->nend++;
                  parent->nend--;
              }
          }
@@ -3935,7 +2985,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
  }
  int test_utr(args_t *args, bcf1_t *rec)
  {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      // note that the off-by-one extension of rlen is deliberate to account for insertions
      if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
  
@@ -3946,7 +2996,7 @@ int test_utr(args_t *args, bcf1_t *rec)
      while ( regitr_overlap(args->itr) )
      {
          gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
-        tscript_t *tr = splice.tr = utr->tr;
+        gf_tscript_t *tr = splice.tr = utr->tr;
          for (i=1; i<rec->n_allele; i++)
          {
              if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -3973,7 +3023,7 @@ int test_utr(args_t *args, bcf1_t *rec)
  }
  int test_splice(args_t *args, bcf1_t *rec)
  {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
  
      splice_t splice;
@@ -4005,7 +3055,7 @@ int test_splice(args_t *args, bcf1_t *rec)
  }
  int test_tscript(args_t *args, bcf1_t *rec)
  {
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
      if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
  
      splice_t splice;
@@ -4014,7 +3064,7 @@ int test_tscript(args_t *args, bcf1_t *rec)
      int i, ret = 0;
      while ( regitr_overlap(args->itr) )
      {
-        tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+        gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
          for (i=1; i<rec->n_allele; i++)
          {
              if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
@@ -4048,7 +3098,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
          warned = 1;
      }
  
-    const char *chr = bcf_seqname(args->hdr,rec);
+    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
  
      // only insertions atm
      int beg = rec->pos + 1;
@@ -4063,7 +3113,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
              csq_t csq;
              memset(&csq, 0, sizeof(csq_t));
              gf_cds_t *cds    = regitr_payload(args->itr,gf_cds_t*);
-            tscript_t *tr    = cds->tr;
+            gf_tscript_t *tr = cds->tr;
              csq.type.type    = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
              csq.pos          = rec->pos;
              csq.type.biotype = tr->type;
@@ -4081,7 +3131,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
              csq_t csq;
              memset(&csq, 0, sizeof(csq_t));
              gf_utr_t *utr    = regitr_payload(args->itr, gf_utr_t*);
-            tscript_t *tr    = utr->tr;
+            gf_tscript_t *tr = utr->tr;
              csq.type.type    = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
              csq.pos          = rec->pos;
              csq.type.biotype = tr->type;
@@ -4120,7 +3170,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
          {
              csq_t csq;
              memset(&csq, 0, sizeof(csq_t));
-            tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+            gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
              splice.vcf.alt = rec->d.allele[1];
              splice.csq     = csq_class;
              int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
@@ -4181,7 +3231,10 @@ static void process(args_t *args, bcf1_t **rec_ptr)
          // Perform a simple sanity check (that does not catch much), the chromosome must be present in the
          // reference file
          if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
-            error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+        {
+            if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
+                error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+        }
      }
      if ( prev_pos > rec->pos )
          error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
@@ -4256,9 +3309,12 @@ static const char *usage(void)
          "                                       r: require phased GTs, throw an error on unphased het GTs\n"
          "                                       R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
          "                                       s: skip unphased hets\n"
-        "Options:\n"
-        "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
+        "GFF options:\n"
+        "       --dump-gff FILE.gz            Dump the parsed GFF file (for debugging purposes)\n"
          "       --force                       Run even if some sanity checks fail\n"
+        "       --unify-chr-names 1|0         Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+        "General options:\n"
+        "   -e, --exclude EXPR                Exclude sites for which the expression is true\n"
          "   -i, --include EXPR                Select sites for which the expression is true\n"
          "       --no-version                  Do not append version and command line to the header\n"
          "   -o, --output FILE                 Write output to a file [standard output]\n"
@@ -4274,6 +3330,7 @@ static const char *usage(void)
          "       --targets-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
          "       --threads INT                 Use multithreading with <int> worker threads [0]\n"
          "   -v, --verbose INT                 Verbosity level 0-2 [1]\n"
+        "       --write-index                 Automatically index the output files [off]\n"
          "\n"
          "Example:\n"
          "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4294,6 +3351,7 @@ int main_csq(int argc, char *argv[])
      args->verbosity = 1;
      args->record_cmd_line = 1;
      args->clevel = -1;
+    args->unify_chr_names = 1;
  
      static struct option loptions[] =
      {
@@ -4323,6 +3381,9 @@ int main_csq(int argc, char *argv[])
          {"targets-file",1,0,'T'},
          {"targets-overlap",required_argument,NULL,5},
          {"no-version",no_argument,NULL,3},
+        {"write-index",no_argument,NULL,6},
+        {"dump-gff",required_argument,NULL,7},
+        {"unify-chr-names",required_argument,NULL,8},
          {0,0,0,0}
      };
      int c, targets_is_file = 0, regions_is_file = 0;
@@ -4341,7 +3402,7 @@ int main_csq(int argc, char *argv[])
              case  3 : args->record_cmd_line = 0; break;
              case 'b':
                      args->brief_predictions = 1;
-                    fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+                    fprintf(bcftools_stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n");
                      break;
              case 'B':
                      args->brief_predictions = strtol(optarg,&tmp,10);
@@ -4411,6 +3472,13 @@ int main_csq(int argc, char *argv[])
                  targets_overlap = parse_overlap_option(optarg);
                  if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
                  break;
+            case  6 : args->write_index = 1; break;
+            case  7 : args->dump_gff = optarg; break;
+            case  8 :
+                if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
+                else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
+                else error("Could not parse: --unify-chr-names %s\n",optarg);
+                break;
              case 'h':
              case '?': error("%s",usage());
              default: error("The option not recognised: %s\n\n", optarg); break;
diff --git a/bcftools/filter.c b/bcftools/filter.c

index 3925475b79daf07d55636bcf5266525578ce7723..b6547f81f25745ccef9c9707ec28233d27e0107c 100644 (file)
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -109,8 +109,8 @@ struct _filter_t
  #if ENABLE_PERL_FILTERS
      PerlInterpreter *perl;
  #endif
-    char **undef_tag;
-    int nundef_tag;
+    char **undef_tag, **used_tag;
+    int nundef_tag, nused_tag;
      int status, exit_on_error;
  };
  
@@ -328,6 +328,32 @@ const char **filter_list_undef_tags(filter_t *filter, int *ntags)
      *ntags = filter->nundef_tag;
      return (const char**)filter->undef_tag;
  }
+// Record the tag name "<prefix><str>" (prefix may be NULL) in filter->used_tag
+// unless an identical string is already listed. A newly added string is owned
+// by the list.
+static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str)
+{
+    int i;
+    kstring_t tmp = {0,0,0};
+    if ( prefix ) kputs(prefix,&tmp);
+    kputs(str,&tmp);
+
+    // tmp.s stays NULL if nothing could be appended; verify before it is handed to strcmp below
+    if ( !tmp.s ) error("Could not allocate memory\n");
+
+    for (i=0; i<filter->nused_tag; i++)
+        if ( !strcmp(tmp.s,filter->used_tag[i]) ) break;
+    if ( i<filter->nused_tag )
+    {
+        // already listed, discard the temporary string
+        free(tmp.s);
+        return;
+    }
+
+    filter->nused_tag++;
+    filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag);
+    if ( !filter->used_tag ) error("Could not allocate memory\n");
+    filter->used_tag[filter->nused_tag-1] = tmp.s;
+}
+// Return the filter's list of used tags and store the number of entries in *ntags.
+// The returned array is the filter's own storage, not a copy.
+const char **filter_list_used_tags(filter_t *filter, int *ntags)
+{
+    const char **tags = (const char**)filter->used_tag;
+    *ntags = filter->nused_tag;
+    return tags;
+}
+
  
  
  /*
@@ -2841,6 +2867,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          {
              tok->setter = filters_set_qual;
              tok->tag = strdup("QUAL");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
@@ -2855,24 +2882,28 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->tag = strdup("FILTER");
              filter->max_unpack |= BCF_UN_FLT;
              tok->tag_type = BCF_HL_FLT;
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
          {
              tok->comparator = filters_cmp_id;
              tok->tag = strdup("ID");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"CHROM",len) )
          {
              tok->setter = &filters_set_chrom;
              tok->tag = strdup("CHROM");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"POS",len) )
          {
              tok->setter = &filters_set_pos;
              tok->tag = strdup("POS");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"REF",len) )
@@ -2880,6 +2911,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->setter = &filters_set_ref_string;
              tok->is_str = 1;
              tok->tag = strdup("REF");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"ALT",len) )
@@ -2891,6 +2923,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->idxs[0] = -1;
              tok->nidxs   = 1;
              tok->idx     = -2;
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"N_ALT",len) )
@@ -3018,6 +3051,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          }
          tok->tag = strdup(tmp.s);
          if ( tmp.s ) free(tmp.s);
+        filter_add_used_tag(filter,is_fmt ? "FORMAT/" : "INFO/",tok->tag);
          return 0;
      }
      else if ( !strcasecmp(tmp.s,"ALT") )
@@ -3026,6 +3060,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          tok->is_str = 1;
          tok->tag = strdup(tmp.s);
          free(tmp.s);
+        filter_add_used_tag(filter,NULL,tok->tag);
          return 0;
      }
      else if ( !strcasecmp(tmp.s,"AN") )
@@ -3669,7 +3704,9 @@ void filter_destroy(filter_t *filter)
          }
      }
      for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+    for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
      free(filter->undef_tag);
+    free(filter->used_tag);
      free(filter->cached_GT.buf);
      free(filter->cached_GT.mask);
      free(filter->filters);
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c

index 8e2d1d16c102d6399fc296b4fdf5ca468715ee3f..d0e26258cc4f65554418081bc5e08fdace71f504 100644 (file)
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -111,8 +111,8 @@ struct _filter_t
  #if ENABLE_PERL_FILTERS
      PerlInterpreter *perl;
  #endif
-    char **undef_tag;
-    int nundef_tag;
+    char **undef_tag, **used_tag;
+    int nundef_tag, nused_tag;
      int status, exit_on_error;
  };
  
@@ -330,6 +330,32 @@ const char **filter_list_undef_tags(filter_t *filter, int *ntags)
      *ntags = filter->nundef_tag;
      return (const char**)filter->undef_tag;
  }
+// Record the tag name "<prefix><str>" (prefix may be NULL) in filter->used_tag
+// unless an identical string is already listed. A newly added string is owned
+// by the list.
+static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str)
+{
+    int i;
+    kstring_t tmp = {0,0,0};
+    if ( prefix ) kputs(prefix,&tmp);
+    kputs(str,&tmp);
+
+    // tmp.s stays NULL if nothing could be appended; verify before it is handed to strcmp below
+    if ( !tmp.s ) error("Could not allocate memory\n");
+
+    for (i=0; i<filter->nused_tag; i++)
+        if ( !strcmp(tmp.s,filter->used_tag[i]) ) break;
+    if ( i<filter->nused_tag )
+    {
+        // already listed, discard the temporary string
+        free(tmp.s);
+        return;
+    }
+
+    filter->nused_tag++;
+    filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag);
+    if ( !filter->used_tag ) error("Could not allocate memory\n");
+    filter->used_tag[filter->nused_tag-1] = tmp.s;
+}
+// Return the filter's list of used tags and store the number of entries in *ntags.
+// The returned array is the filter's own storage, not a copy.
+const char **filter_list_used_tags(filter_t *filter, int *ntags)
+{
+    const char **tags = (const char**)filter->used_tag;
+    *ntags = filter->nused_tag;
+    return tags;
+}
+
  
  
  /*
@@ -2843,6 +2869,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          {
              tok->setter = filters_set_qual;
              tok->tag = strdup("QUAL");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
@@ -2857,24 +2884,28 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->tag = strdup("FILTER");
              filter->max_unpack |= BCF_UN_FLT;
              tok->tag_type = BCF_HL_FLT;
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
          {
              tok->comparator = filters_cmp_id;
              tok->tag = strdup("ID");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"CHROM",len) )
          {
              tok->setter = &filters_set_chrom;
              tok->tag = strdup("CHROM");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"POS",len) )
          {
              tok->setter = &filters_set_pos;
              tok->tag = strdup("POS");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"REF",len) )
@@ -2882,6 +2913,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->setter = &filters_set_ref_string;
              tok->is_str = 1;
              tok->tag = strdup("REF");
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"ALT",len) )
@@ -2893,6 +2925,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->idxs[0] = -1;
              tok->nidxs   = 1;
              tok->idx     = -2;
+            filter_add_used_tag(filter,NULL,tok->tag);
              return 0;
          }
          else if ( !strncasecmp(str,"N_ALT",len) )
@@ -3020,6 +3053,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          }
          tok->tag = strdup(tmp.s);
          if ( tmp.s ) free(tmp.s);
+        filter_add_used_tag(filter,is_fmt ? "FORMAT/" : "INFO/",tok->tag);
          return 0;
      }
      else if ( !strcasecmp(tmp.s,"ALT") )
@@ -3028,6 +3062,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          tok->is_str = 1;
          tok->tag = strdup(tmp.s);
          free(tmp.s);
+        filter_add_used_tag(filter,NULL,tok->tag);
          return 0;
      }
      else if ( !strcasecmp(tmp.s,"AN") )
@@ -3671,7 +3706,9 @@ void filter_destroy(filter_t *filter)
          }
      }
      for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+    for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
      free(filter->undef_tag);
+    free(filter->used_tag);
      free(filter->cached_GT.buf);
      free(filter->cached_GT.mask);
      free(filter->filters);
diff --git a/bcftools/filter.h b/bcftools/filter.h

index 7be842a3ad28fc229745f1835e7dbfbf94b0fb72..cc60d6b96547a061d46baa4b90a48f7a59fe5a32 100644 (file)
--- a/bcftools/filter.h
+++ b/bcftools/filter.h
@@ -79,5 +79,6 @@ filter_t *filter_parse(bcf_hdr_t *hdr, const char *str);
    */
  int filter_status(filter_t *filter);
  const char **filter_list_undef_tags(filter_t *filter, int *nundef);
+const char **filter_list_used_tags(filter_t *filter, int *nused);
  
  #endif
diff --git a/bcftools/gff.c b/bcftools/gff.c

new file mode 100644 (file)

index 0000000..90da84b
--- /dev/null
+++ b/bcftools/gff.c
@@ -0,0 +1,1098 @@
+/* The MIT License
+
+   Copyright (c) 2023 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "gff.h"
+
+/*
+    Helper structures, only for initialization
+
+    ftr_t
+        temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+// One exon, CDS or UTR record collected while the GFF is being read
+typedef struct
+{
+    int type;           // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+    uint32_t beg;       // feature start (presumably 0-based as produced by gff_parse_beg_end - TODO confirm)
+    uint32_t end;       // feature end (same convention as beg)
+    uint32_t trid;      // numeric id of the parent transcript, assigned by gff_id_register() in gff_parse_exon()
+    uint32_t strand:1;  // STRAND_REV,STRAND_FWD
+    uint32_t phase:2;   // 0, 1, 2, or 3 for unknown
+    uint32_t iseq:29;   // sequence index returned by feature_set_seq(), which asserts that it fits in 29 bits
+}
+ftr_t;
+
+/*
+    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+    to integer id.  To keep the memory requirements low, the original version
+    relied on IDs in the form of a string prefix and a numerical id.  However,
+    it turns out that this assumption is not valid for some ensembl GFFs, see
+    for example Zea_mays.AGPv4.36.gff3.gz
+ */
+// Two-way table between id strings and consecutive integer ids, filled by gff_id_register()
+typedef struct
+{
+    void *str2id;       // khash_str2int, id string to numeric id
+    int nstr, mstr;     // nstr: number of registered ids (also the next id to assign); mstr: size tracked by hts_expand for str
+    char **str;         // numeric id to string
+}
+id_tbl_t;
+
+// Working data needed only while the GFF is parsed and the indexes are built
+typedef struct
+{
+    // all exons, CDS, UTRs
+    ftr_t *ftr;
+    int nftr, mftr;
+
+    // mapping from gene id to gf_gene_t
+    kh_int2gene_t *gid2gene;
+
+    // mapping from transcript id to tscript, for quick CDS anchoring
+    kh_int2tscript_t *id2tr;
+
+    // sequences
+    void *seq2int;  // str2int hash
+    char **seq;     // sequence index to name, the strings are allocated in feature_set_seq()
+    int nseq, mseq;
+
+    // ignored biotypes, the stored value counts how many times the biotype was seen (see gff_ignored_biotype)
+    void *ignored_biotypes;
+
+    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
+
+    // pointers to the current partially processed line; each *_end pointer addresses the last
+    // character of the field (inclusive), the fields are not NUL-terminated
+    char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end,
+         *chr, *chr_end, *name, *name_end, *type, *type_end;
+}
+aux_t;
+
+// The GFF reader state: user settings, the resulting region indexes and parsing scratch data
+struct gff_t_
+{
+    const char *fname, *dump_fname;
+
+    // the main regidx lookups, from chr:beg-end to overlapping features and
+    // index iterator
+    regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+
+    // temporary structures, deleted after initialization
+    aux_t init;
+
+    // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
+    id_tbl_t tscript_ids;
+
+    // strip_chr_names: when set, gff_parse_chr() drops a leading "chr" (any case) from sequence names
+    // verbosity: 0 silences the parsing warnings, values above 1 repeat a warning on every occurrence
+    int strip_chr_names, verbosity;
+    int force;      // force run under various conditions. Currently only to skip out-of-phase transcripts
+
+    // per-category warning counters; the parsing functions consult them so that each
+    // kind of warning is printed only once unless verbosity > 1
+    struct {
+        int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+        int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+    } warned;
+};
+
+// Lookup tables used by gf_type2gff_string(). Entries are addressed by (type code - 1) once the
+// coding/special bits are masked out, so the order has to follow the numeric GF_* codes
+// (presumably defined in gff.h - keep in sync when adding entries).
+static const char *gf_strings_noncoding[] =
+{
+    "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+    "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+    "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+    "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+    "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+    "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
+    "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+    "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf",
+    "lncRNA"
+};
+// coding biotypes, selected when GF_is_coding(type) is true
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+// structural feature types, selected for non-coding codes at or above 1<<GF_coding_bit
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+// Set one reader option. The single variadic argument is read as char* for
+// dump_fname and as int for force_out_of_phase, strip_chr_names and verbosity.
+// Any other key is reported through error(). Returns 0.
+int gff_set(gff_t *gff, gff_opt_t key, ...)
+{
+    int known = 1;
+    va_list ap;
+    va_start(ap, key);
+
+    if ( key==dump_fname ) gff->dump_fname = va_arg(ap,char*);
+    else if ( key==force_out_of_phase ) gff->force = va_arg(ap,int);
+    else if ( key==strip_chr_names ) gff->strip_chr_names = va_arg(ap,int);
+    else if ( key==verbosity ) gff->verbosity = va_arg(ap,int);
+    else known = 0;
+
+    va_end(ap);
+
+    if ( !known ) error("The key %d is not supported with gff_set\n",key);
+    return 0;
+}
+
+// Return the region index selected by key (idx_cds, idx_utr, idx_exon or idx_tscript).
+// Any other key is reported through error() and NULL is returned.
+void *gff_get(gff_t *gff, gff_opt_t key)
+{
+    void *idx = NULL;
+
+    if ( key==idx_cds ) idx = gff->idx_cds;
+    else if ( key==idx_utr ) idx = gff->idx_utr;
+    else if ( key==idx_exon ) idx = gff->idx_exon;
+    else if ( key==idx_tscript ) idx = gff->idx_tscript;
+    else error("The key %d is not supported with gff_get\n",key);
+
+    return idx;
+}
+
+// Translate a numeric id back to its string. The type argument is not consulted,
+// currently only transcript ids are supported. No range check is done on id.
+const char *gff_id2string(gff_t *gff, id_type_t type, int id)
+{
+    id_tbl_t *tbl = &gff->tscript_ids;
+    return tbl->str[id];
+}
+
+// Return the GFF string for a GF_* type code. Coding types are looked up in
+// gf_strings_coding, codes below 1<<GF_coding_bit in gf_strings_noncoding and
+// the remaining ones in gf_strings_special. No range check is done on the code.
+const char *gf_type2gff_string(int type)
+{
+    const int coding_flag = 1<<GF_coding_bit;
+
+    if ( GF_is_coding(type) )
+        return gf_strings_coding[(type & (coding_flag - 1)) - 1];
+
+    if ( type < coding_flag )
+        return gf_strings_noncoding[type - 1];
+
+    return gf_strings_special[(type & ((1<<(GF_coding_bit+1)) - 1)) - 1];
+}
+
+/*
+    gff parsing functions
+*/
+// Return the index of the sequence name spanning chr_beg..chr_end (inclusive),
+// registering the name on first use. The name is terminated in place for the
+// duration of the call and the overwritten character is restored afterwards.
+static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
+{
+    aux_t *aux = &gff->init;
+    int idx;
+
+    char saved = chr_end[1];
+    chr_end[1] = 0;
+
+    if ( khash_str2int_get(aux->seq2int, chr_beg, &idx)!=0 )
+    {
+        // unseen sequence, keep a private copy of the name
+        char *name = strdup(chr_beg);
+        hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+        aux->seq[aux->nseq] = name;
+        idx = khash_str2int_inc(aux->seq2int, name);
+        aux->nseq++;
+        assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+    }
+
+    chr_end[1] = saved;
+    return idx;
+}
+// Advance ss past the next tab and return the start of the following column.
+// A line ending before a tab is found is reported through error().
+static inline char *gff_skip(const char *line, char *ss)
+{
+    ss += strcspn(ss, "\t");
+    if ( *ss!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return ss+1;
+}
+// Locate the first column (sequence name) of the line. On return *chr_beg points to
+// the first and *chr_end to the last character of the name; with strip_chr_names set
+// a leading "chr" (any case) is left out. A line without a tab is reported through error().
+static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end)
+{
+    const char *tab = line + strcspn(line, "\t");
+    if ( *tab!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    const char *name = line;
+    if ( gff->strip_chr_names && strncasecmp("chr",line,3)==0 ) name = line + 3;
+
+    *chr_beg = (char*) name;
+    *chr_end = (char*) tab - 1;
+}
+// Parse the two coordinate columns starting at ss. Both numbers are stored
+// decremented by one in *beg and *end. Returns the pointer one past the character
+// which terminated the second number; a missing number is reported through error().
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+    char *stop = ss;
+
+    long first = strtol(ss, &stop, 10);
+    *beg = first - 1;
+    if ( stop==ss ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+
+    char *second = stop + 1;
+    long last = strtol(second, &stop, 10);
+    *end = last - 1;
+    if ( stop==second ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    return stop + 1;
+}
+// Reset the table to an empty state and create its string-to-id hash
+static void gff_id_init(id_tbl_t *tbl)
+{
+    memset(tbl, 0, sizeof(id_tbl_t));
+    void *hash = khash_str2int_init();
+    tbl->str2id = hash;
+}
+// Release the table: the hash together with its keys (the id strings allocated
+// in gff_id_register) and then the id-to-string array itself
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+    void *hash = tbl->str2id;
+    char **ids = tbl->str;
+    khash_str2int_destroy_free(hash);
+    free(ids);
+}
+// Look up the id string spanning beg..end (inclusive) and store its numeric id in
+// *id_ptr, assigning the next free number when the string is seen for the first time.
+// The string is terminated in place for the duration of the call. Always returns 0.
+static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr)
+{
+    int idx;
+    char saved = end[1];
+    end[1] = 0;
+
+    if ( khash_str2int_get(tbl->str2id, beg, &idx) < 0 )
+    {
+        // new id: append a copy of the string and index it
+        idx = tbl->nstr;
+        tbl->nstr++;
+        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+        char *copy = strdup(beg);
+        tbl->str[idx] = copy;
+        khash_str2int_set(tbl->str2id, copy, idx);
+    }
+
+    end[1] = saved;
+    *id_ptr = idx;
+    return 0;
+}
+// Translate a biotype string into a GF_* code. Keywords are matched as prefixes
+// (strncmp/strncasecmp limited to the keyword length), so the string does not have
+// to be terminated right after the keyword; longer keywords are therefore tested
+// before their own prefixes. Returns -1 if line is NULL and 0 if nothing matched.
+static inline int gff_parse_biotype(char *line)
+{
+    if ( !line ) return -1;
+    switch (*line)
+    {
+        case 'p':
+            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+            break;
+        case 'a':
+            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+            break;
+        case 'I':
+            if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C;
+            else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D;
+            else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J;
+            else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V;
+            else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV;
+            break;
+        case 'T':
+            if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C;
+            else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D;
+            else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J;
+            else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V;
+            break;
+        case 'M':
+            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+            else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+            else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_rRNA;   // was GF_MT_tRNA, which mislabelled mitochondrial rRNA
+            else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'l':
+            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+            else if ( !strncmp(line,"lncRNA",6) ) return GF_lncRNA;     // 6 characters; a length of 7 also demanded a terminating NUL
+            break;
+        case 'm':
+            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+            else if ( !strncasecmp(line,"mRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'r':
+            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+            break;
+        case 's':
+            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+            break;
+        case 't':
+            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+            break;
+        case 'n':
+            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+            break;
+        case 'N':
+            if ( !strncmp(line,"NMD",3) ) return GF_NMD;
+            break;
+        case 'k':
+            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+            break;
+        case 'u':
+            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+            break;
+        case 'L':
+            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+            break;
+        case '3':
+            if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            break;
+        case 'd':
+            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+            break;
+        case 'v':
+            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+            break;
+        case 'b':
+            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+            break;
+    }
+    return 0;
+}
+// Count one more occurrence of the unrecognised biotype spanning ss..se (inclusive)
+// in the ignored_biotypes hash. Returns 0 without doing anything when ss is NULL
+// and 1 otherwise. The string is terminated in place for the duration of the call.
+static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se)
+{
+    if ( ss==NULL ) return 0;
+
+    void *hash = gff->init.ignored_biotypes;
+    char saved = se[1];
+    se[1] = 0;
+
+    // a biotype seen for the first time needs its own copy to serve as the hash key
+    int count = 0;
+    int found = khash_str2int_get(hash, ss, &count)==0;
+    khash_str2int_set(hash, found ? ss : strdup(ss), count+1);
+
+    se[1] = saved;
+    return 1;
+}
+// Return the gene record registered under gene_id, creating and registering a
+// zero-initialised record when the id is seen for the first time. Allocation
+// failures are reported through error() rather than returned to the caller.
+static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+    khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+    gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
+    if ( !gene )
+    {
+        gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+        if ( !gene ) error("Could not allocate memory\n");
+        int ret;
+        k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+        if ( ret < 0 ) error("Could not allocate memory\n");    // kh_put reports a failed insertion with a negative status
+        kh_val(aux->gid2gene,k) = gene;
+    }
+    return gene;
+}
+// Register a transcript line. The biotype is taken from the biotype attribute and,
+// failing that, from the feature type column; transcripts whose biotype is not
+// recognised are skipped (counted in ignored_biotypes, or warned about when no
+// biotype attribute is present). Otherwise numeric ids are assigned to the
+// transcript and its parent gene and a new gf_tscript_t is stored in aux->id2tr.
+static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    ftr->type = gff_parse_biotype(aux->biotype);
+    if ( ftr->type <= 0 )
+    {
+        // fall back to the type column, terminated in place for the lookup
+        char tmp = aux->type_end[1];
+        aux->type_end[1] = 0;
+        ftr->type = gff_parse_biotype(aux->type);
+        aux->type_end[1] = tmp;
+    }
+    if ( ftr->type <= 0 )
+    {
+        if ( !gff_ignored_biotype(gff,aux->biotype,aux->biotype_end) )
+        {
+            if ( gff->verbosity > 0 )
+            {
+                if ( !gff->warned.unknown_tscript_biotype || gff->verbosity > 1 )
+                    fprintf(stderr,"Warning: Ignoring transcript with unknown biotype .. %s\n", line);
+                gff->warned.unknown_tscript_biotype++;
+            }
+        }
+        return;
+    }
+
+    if ( !aux->id )
+        error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    if ( !aux->parent )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    uint32_t trid,gene_id;
+    gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid);
+    gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id);
+
+    gf_tscript_t *tr = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t));
+    if ( !tr ) error("[%s:%d %s] Could not allocate memory\n",__FILE__,__LINE__,__FUNCTION__);
+    tr->id     = trid;
+    tr->strand = ftr->strand;
+    tr->gene   = gene_init(aux, gene_id);
+    tr->type   = ftr->type;
+    tr->beg    = ftr->beg;
+    tr->end    = ftr->end;
+
+    khint_t k;
+    int ret;
+    k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+    if ( ret < 0 ) error("[%s:%d %s] Could not allocate memory\n",__FILE__,__LINE__,__FUNCTION__);   // failed insertion
+    kh_val(aux->id2tr,k) = tr;
+}
+// register exon, CDS, UTR
+// Requires the Parent attribute of the current line, stores the numeric id of the
+// parent transcript in ftr->trid and the sequence index in ftr->iseq.
+static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( !aux->parent )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // associate with transcript id
+    gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
+
+    // NOTE(review): strand and phase are declared as unsigned bit-fields in ftr_t (1 and 2 bits),
+    // so neither can compare equal to -1 and the two warnings below appear to be unreachable.
+    // Which value the caller uses for "unknown" is not visible here - confirm before changing.
+    if ( ftr->strand==-1 && gff->verbosity > 0 )
+    {
+        if ( !gff->warned.unknown_strand || gff->verbosity > 1 )
+            fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
+        gff->warned.unknown_strand++;
+    }
+    if ( ftr->phase==-1 && gff->verbosity > 0 )
+    {
+        if ( !gff->warned.unknown_phase|| gff->verbosity > 1 )
+            fprintf(stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
+        gff->warned.unknown_phase++;
+    }
+    ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end);
+}
+// Register a gene line: fill in the position, strand, id and name of the gene record.
+// Lines without an ID attribute are ignored. A record that already has a name was
+// filled by an earlier gene line with the same id, which triggers the duplicate-id
+// warning and leaves the record untouched. Allocation failures go through error().
+static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( !aux->id ) return;
+
+    uint32_t gene_id;
+    gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id);
+
+    gf_gene_t *gene = gene_init(aux, gene_id);
+    if ( gene->name )
+    {
+        if ( !gff->warned.duplicate_id || gff->verbosity > 1 )
+            fprintf(stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line);
+        gff->warned.duplicate_id++;
+        return;
+    }
+
+    gene->iseq   = feature_set_seq(gff, aux->chr,aux->chr_end);
+    gene->beg    = ftr->beg;
+    gene->end    = ftr->end;
+    gene->strand = ftr->strand;
+    gene->id     = gene_id;
+
+    if ( aux->name )
+    {
+        // name_end points to the last character of the name, hence the +1 for the length
+        size_t len = aux->name_end - aux->name + 1;
+        gene->name = (char*) malloc(len + 1);
+        if ( !gene->name ) error("[%s:%d %s] Could not allocate memory\n",__FILE__,__LINE__,__FUNCTION__);
+        memcpy(gene->name,aux->name,len);
+        gene->name[len] = 0;
+    }
+    else
+    {
+        gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
+        if ( !gene->name ) error("[%s:%d %s] Could not allocate memory\n",__FILE__,__LINE__,__FUNCTION__);
+    }
+}
+
+// Parse one GFF line and dispatch it to gff_parse_gene, gff_parse_exon or gff_parse_transcript.
+//
+// Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them,
+// or -1 to indicate the structure needs not be saved (either because the line was skipped or
+// because it was saved as transcript or gene.)
+//
+// The pointers stored in gff->init (chr, type, id, parent, name, biotype and their *_end
+// counterparts) point into `line` and are valid only until the next line is parsed.
+static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
+{
+    // - skip empty lines and commented lines
+    // - columns
+    //      1.      chr
+    //      2.      <skip>
+    //      3.      CDS, transcript, gene, ...
+    //      4-5.    beg,end
+    //      6.      <skip>
+    //      7.      strand
+    //      8.      phase
+    //      9.      Parent=transcript:ENST(\d+);ID=...;biotype=... etc
+
+    char *ss = line;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    aux_t *aux = &gff->init;
+    gff_parse_chr(gff, line, &aux->chr, &aux->chr_end);
+    // chr_end is the last character of column 1, so chr_end+2 is the start of column 2;
+    // gff_skip then moves past column 2 to the start of column 3
+    ss = gff_skip(line, aux->chr_end + 2);
+
+    // 3rd column: is this a CDS, transcript, gene, etc.. The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3
+    int is_gene_line = 0;
+    ftr->type = 0;      // stays 0 for anything that is not an exon, CDS or UTR
+    aux->type = ss;
+    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+    else if ( !strncmp("biological_region\t",ss,18) ) { return -1; }    // skip
+    else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; }
+    else ss = gff_skip(line, ss);
+    aux->type_end = ss - 1;     // the tab following the third column
+
+    // 4-5th columns: beg,end
+    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+
+    // 6th column: skip
+    ss = gff_skip(line, ss);
+
+    // 7th column: strand
+    // NOTE(review): if ftr->strand is a 1-bit unsigned bit-field, the -1 stored here is
+    // indistinguishable from one of the two valid strand values - confirm against ftr_t
+    ftr->strand = -1;
+    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+    ss += 2;    // assumes a one-character column followed by a tab, this is not verified
+
+    // 8th column: phase (codon offset)
+    ftr->phase = -1;
+    if ( *ss == '0' ) ftr->phase = 0;
+    else if ( *ss == '1' ) ftr->phase = 1;
+    else if ( *ss == '2' ) ftr->phase = 2;
+    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
+    ss += 2;    // same one-character-plus-tab assumption as for the strand
+
+    // 9th column: id, parent, name, biotype
+    // Walk the semicolon-separated attributes; ss is the start of the current attribute and es
+    // its terminator (';' or the end of the line). The *_end pointers address the last character
+    // of the value (es - 1).
+    aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL;
+    while ( *ss )
+    {
+        char *es = ss;
+        while ( *es && *es!=';' ) es++;
+        if ( !strncmp(ss,"ID=",3) )
+        {
+            ss += 3;
+            aux->id_end = es - 1;
+            aux->id = ss;
+            // strip the "gene:" / "transcript:" prefix; an ID with the "gene:" prefix marks a gene line
+            if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; }
+            else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11;
+        }
+        else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; }
+        else if ( !strncmp(ss,"Parent=",7) )
+        {
+            ss += 7;
+            aux->parent_end = es - 1;
+            aux->parent = ss;
+            if ( !strncmp(ss,"gene:",5) ) aux->parent += 5;
+            else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11;
+        }
+        else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; }
+        else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; }
+        if ( !*es ) break;
+        ss = es + 1;
+    }
+
+    // anything flagged as a gene, and anything without a parent, is handled as a gene
+    if ( is_gene_line || !aux->parent )
+    {
+        gff_parse_gene(gff, line, ftr);
+        return -1;
+    }
+
+    // exon, CDS, UTR: the caller keeps ftr for the second pass
+    if ( ftr->type )
+    {
+        gff_parse_exon(gff, line, ftr);
+        return 0;
+    }
+
+    // every remaining line with a parent is handled as a transcript
+    gff_parse_transcript(gff, line, ftr);
+    return -1;
+}
+
+// qsort callback: orders an array of gf_cds_t pointers by ascending beg
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+    const gf_cds_t *lhs = *((gf_cds_t**)a);
+    const gf_cds_t *rhs = *((gf_cds_t**)b);
+    if ( lhs->beg == rhs->beg ) return 0;
+    return lhs->beg < rhs->beg ? -1 : 1;
+}
+
+// Return pointers to the first and the last character of the iseq-th sequence name.
+// The scan tests the character following the current one, so the name is expected
+// to be non-empty.
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+    char *first = aux->seq[iseq];
+    char *last  = first;
+    while ( last[1] ) last++;
+    *chr_beg = first;
+    *chr_end = last;
+}
+// Look up the transcript registered under the numeric id trid in aux->id2tr.
+// The transcript must exist, a missing entry trips the assert.
+static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+    gf_tscript_t *found = NULL;
+    khint_t slot = kh_get(int2tscript, aux->id2tr, (int)trid);
+    if ( slot != kh_end(aux->id2tr) ) found = kh_val(aux->id2tr, slot);
+    assert( found );
+    return found;
+}
+/*
+    Attach a parsed CDS line (ftr) to its transcript. The CDS is only appended
+    to the transcript's cds array here; sorting, trimming and insertion into
+    idx_cds happen later in tscript_init_cds(). A strand which differs from the
+    strand of the transcript is reported via error().
+*/
+static void register_cds(gff_t *gff, ftr_t *ftr)
+{
+    gf_tscript_t *owner = tscript_init(&gff->init, ftr->trid);
+    if ( owner->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,owner->strand,ftr->strand);
+
+    gf_cds_t *rec = (gf_cds_t*) malloc(sizeof(gf_cds_t));
+    rec->tr    = owner;
+    rec->beg   = ftr->beg;
+    rec->len   = ftr->end - ftr->beg + 1;
+    rec->phase = ftr->phase;
+    rec->icds  = 0;     // to keep valgrind on mac happy
+
+    // grow the array if needed, then append
+    hts_expand(gf_cds_t*,owner->ncds+1,owner->mcds,owner->cds);
+    owner->cds[owner->ncds] = rec;
+    owner->ncds++;
+}
+// Create a UTR record from a parsed three_prime_UTR/five_prime_UTR line and insert it
+// into idx_utr under the chromosome of the gene owning the transcript.
+static void register_utr(gff_t *gff, ftr_t *ftr)
+{
+    gf_utr_t *rec = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+    if ( ftr->type==GF_UTR3 )
+        rec->which = prime3;
+    else
+        rec->which = prime5;
+    rec->beg = ftr->beg;
+    rec->end = ftr->end;
+    rec->tr  = tscript_init(&gff->init, ftr->trid);
+
+    // the index stores the pointer to the record, not the record itself
+    char *name_beg, *name_end;
+    chr_beg_end(&gff->init, rec->tr->gene->iseq, &name_beg, &name_end);
+    regidx_push(gff->idx_utr, name_beg,name_end, rec->beg,rec->end, &rec);
+}
+// Create an exon record from a parsed exon line and insert it into idx_exon. The indexed
+// interval is the exon extended by N_SPLICE_REGION_INTRON bases on both sides.
+// NOTE(review): if beg is unsigned and smaller than N_SPLICE_REGION_INTRON the subtraction
+// would wrap around - confirm the field types and whether such exons can occur.
+static void register_exon(gff_t *gff, ftr_t *ftr)
+{
+    gf_exon_t *rec = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+    rec->beg = ftr->beg;
+    rec->end = ftr->end;
+    rec->tr  = tscript_init(&gff->init, ftr->trid);
+
+    // the index stores the pointer to the record, not the record itself
+    char *name_beg, *name_end;
+    chr_beg_end(&gff->init, rec->tr->gene->iseq, &name_beg, &name_end);
+    regidx_push(gff->idx_exon, name_beg,name_end, rec->beg - N_SPLICE_REGION_INTRON, rec->end + N_SPLICE_REGION_INTRON, &rec);
+}
+
+/*
+    Second pass over all registered transcripts, run once all CDS were attached
+    by register_cds():
+        - every transcript is inserted into idx_tscript
+        - for transcripts with CDS: the CDS are sorted by position, the leading
+          phase is trimmed off, the phase column is verified against the
+          accumulated CDS length, an incomplete 3' end is trimmed so that the
+          total length is a multiple of three, and finally the CDS offsets are
+          set and the CDS inserted into idx_cds
+
+    A transcript with an inconsistent phase is either a fatal error or, with
+    gff->force set, is left out of idx_cds (it remains in idx_tscript).
+    NOTE(review): the gf_cds_t records of such skipped transcripts are never
+    pushed to idx_cds - check whether anything frees them.
+*/
+static void tscript_init_cds(gff_t *gff)
+{
+    aux_t *aux = &gff->init;
+
+    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+    khint_t k;
+    for (k=0; k<kh_end(aux->id2tr); k++)
+    {
+        if ( !kh_exist(aux->id2tr, k) ) continue;
+        gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k);
+
+        // position-to-tscript lookup
+        char *chr_beg, *chr_end;
+        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+        regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+        if ( !tr->ncds ) continue;      // transcript with no CDS
+
+        // sort CDS by ascending start coordinate
+        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+        // trim non-coding start
+        // len accumulates the CDS length seen so far in the direction of transcription
+        int i, len = 0;
+        if ( tr->strand==STRAND_FWD )
+        {
+            // forward strand: the first CDS in transcription order is cds[0]
+            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
+            {
+                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+                tr->cds[0]->beg += tr->cds[0]->phase;
+                tr->cds[0]->len -= tr->cds[0]->phase;
+                tr->cds[0]->phase = 0;
+            }
+
+            // sanity check phase; the phase number in gff tells us how many bases to skip in this
+            // feature to reach the first base of the next codon
+            int tscript_ok = 1;
+            for (i=0; i<tr->ncds; i++)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    // nothing to verify for this CDS, only account for its length
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                // convert "bases to skip" (1 or 2) into "bases of the codon already seen" (2 or 1)
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+        else
+        {
+            // reverse strand: the first CDS in transcription order is the last one after sorting
+            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
+            {
+                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141.
+                // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff
+                // todo: the same for the fwd strand
+                i = tr->ncds - 1;
+                int phase = tr->cds[i]->phase;
+                if ( phase ) tr->trim |= TRIM_5PRIME;
+                // drop whole CDS which are shorter than the remaining phase
+                while ( i>=0 && phase > tr->cds[i]->len )
+                {
+                    phase -= tr->cds[i]->len;
+                    tr->cds[i]->phase = 0;
+                    tr->cds[i]->len   = 0;
+                    i--;
+                }
+                // NOTE(review): if every CDS was consumed by the loop above, i is -1 here and
+                // tr->cds[i] is out of bounds - confirm this cannot happen
+                if ( gff->verbosity > 0 && tr->cds[i]->phase )
+                {
+                    if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                        fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                    gff->warned.incomplete_cds++;
+                }
+                // shorten the CDS by its own phase; beg is left unchanged
+                tr->cds[i]->len  -= tr->cds[i]->phase;
+                tr->cds[i]->phase = 0;
+            }
+
+            // sanity check phase, same as for the forward strand but walking the CDS from the last to the first
+            int tscript_ok = 1;
+            for (i=tr->ncds-1; i>=0; i--)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+
+        // set len. At the same time check that CDS within a transcript do not overlap
+        // (an overlap is only reported, the CDS are kept as they are)
+        len = 0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->icds = i;
+            len += tr->cds[i]->len;
+            if ( !i ) continue;
+
+            gf_cds_t *a = tr->cds[i-1];
+            gf_cds_t *b = tr->cds[i];
+            if ( a->beg + a->len - 1 >= b->beg )
+            {
+                if ( gff->verbosity > 0 )
+                {
+                    if ( !gff->warned.overlapping_cds || gff->verbosity > 1 )
+                        fprintf(stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n",
+                                gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+                    gff->warned.overlapping_cds++;
+                }
+            }
+        }
+
+        if ( len%3 != 0 )
+        {
+            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+            if ( gff->verbosity > 0 )
+            {
+                if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                    fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                gff->warned.incomplete_cds++;
+            }
+
+            tr->trim |= TRIM_3PRIME;
+            if ( tr->strand==STRAND_FWD )
+            {
+                // forward strand: remove the excess bases from the end of the last CDS,
+                // continuing with the preceding CDS if the last one is too short
+                i = tr->ncds - 1;
+                while ( i>=0 && len%3 )
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    len -= dlen;
+                    i--;
+                }
+            }
+            else
+            {
+                // reverse strand: remove the excess bases from the start of the first CDS,
+                // which also moves its beg coordinate
+                i = 0;
+                while ( i<tr->ncds && len%3 )
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    tr->cds[i]->beg += dlen;
+                    len -= dlen;
+                    i++;
+                }
+            }
+        }
+
+        // set CDS offsets and insert into regidx
+        // pos is the summed length of all CDS with a smaller index
+        len=0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->pos = len;
+            len += tr->cds[i]->len;
+            regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+        }
+    }
+}
+
+// regidx payload destructor for the CDS, UTR and exon indexes: the payload slot holds
+// a pointer to a malloc'ed record, free the record
+static void regidx_free_gf(void *payload)
+{
+    gf_cds_t **slot = (gf_cds_t**) payload;
+    free(*slot);
+}
+// regidx payload destructor for the transcript index: frees the transcript's array
+// of CDS pointers and the transcript itself
+static void regidx_free_tscript(void *payload)
+{
+    gf_tscript_t **slot = (gf_tscript_t**) payload;
+    gf_tscript_t *tscript = *slot;
+    free(tscript->cds);
+    free(tscript);
+}
+
+static int gff_dump(gff_t *gff, const char *fname)
+{
+    BGZF *out = bgzf_open(fname,"wg");
+    if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno));
+
+    kstring_t str = {0,0,0};
+
+    khint_t k;
+    for (k=0; k<kh_end(gff->init.gid2gene); k++)
+    {
+        if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+        gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+        char *gene_id = gff->init.gene_ids.str[gene->id];
+        str.l = 0;
+        ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+
+    regitr_t *itr = regitr_init(gff->idx_tscript);
+    while ( regitr_loop(itr) )
+    {
+        gf_tscript_t *tr = regitr_payload(itr, gf_tscript_t*);
+        char *gene_id =  gff->init.gene_ids.str[tr->gene->id];
+        const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type);
+        str.l = 0;
+        ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_cds);
+    while ( regitr_loop(itr) )
+    {
+        gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
+        gf_tscript_t *tr = cds->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_utr);
+    while ( regitr_loop(itr) )
+    {
+        gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
+        gf_tscript_t *tr = utr->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_exon);
+    while ( regitr_loop(itr) )
+    {
+        gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
+        gf_tscript_t *tr = exon->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    if ( bgzf_close(out)!=0 ) error("Error: close failed .. %s\n", fname);
+    free(str.s);
+
+    return 0;
+}
+
+int gff_parse(gff_t *gff)
+{
+    if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname);
+
+    aux_t *aux = &gff->init;
+    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
+    aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
+    aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
+    gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
+    aux->ignored_biotypes = khash_str2int_init();
+    gff_id_init(&aux->gene_ids);
+    gff_id_init(&gff->tscript_ids);
+
+    // parse gff
+    kstring_t str = {0,0,0};
+    htsFile *fp = hts_open(gff->fname,"r");
+    if ( !fp ) error("Failed to read %s\n", gff->fname);
+    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+    {
+        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+        int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr);
+        if ( !ret ) aux->nftr++;
+    }
+    free(str.s);
+    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname);
+
+
+    // process gff information: connect CDS and exons to transcripts
+    gff->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+    gff->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+    gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+
+    int i;
+    for (i=0; i<aux->nftr; i++)
+    {
+        ftr_t *ftr = &aux->ftr[i];
+
+        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+        if ( k==kh_end(aux->id2tr) ) continue;       // no corresponding transcript registered, must be an unsupported biotype
+
+        gf_tscript_t *tr = kh_val(aux->id2tr,k);
+        tr->used = 1;
+        tr->gene->used = 1;
+
+        // populate regidx by category:
+        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+        if ( ftr->type==GF_CDS ) register_cds(gff, ftr);
+        else if ( ftr->type==GF_EXON ) register_exon(gff, ftr);
+        else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
+        else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
+        else
+            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+    }
+    tscript_init_cds(gff);
+
+    if ( gff->verbosity > 0 )
+    {
+        fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+                regidx_nregs(gff->idx_tscript),
+                regidx_nregs(gff->idx_exon),
+                regidx_nregs(gff->idx_cds),
+                regidx_nregs(gff->idx_utr));
+    }
+
+    if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
+    {
+        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+        fprintf(stderr,"Ignored the following biotypes:\n");
+        for (i = kh_begin(ign); i < kh_end(ign); i++)
+        {
+            if ( !kh_exist(ign,i)) continue;
+            const char *biotype = kh_key(ign,i);
+            if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
+            fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
+        }
+    }
+    khash_str2int_destroy_free(aux->ignored_biotypes);
+
+    // warned about unprinted warnings
+    if ( gff->verbosity > 0 )
+    {
+        int nwarn = 0;
+        #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1;
+        INC_NWARN(unknown_chr);
+        INC_NWARN(unknown_tscript_biotype);
+        INC_NWARN(unknown_strand);
+        INC_NWARN(unknown_phase);
+        INC_NWARN(duplicate_id);
+        INC_NWARN(unknown_cds_phase);
+        INC_NWARN(incomplete_cds);
+        INC_NWARN(wrong_phase);
+        INC_NWARN(overlapping_cds);
+        if ( nwarn > 0 )
+            fprintf(stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+    }
+
+    if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname);
+
+    if (  !regidx_nregs(gff->idx_tscript) )
+        error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+              "       or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+              "       of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
+
+    free(aux->seq);
+    free(aux->ftr);
+    khash_str2int_destroy_free(aux->seq2int);
+    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+    kh_destroy(int2tscript,aux->id2tr);
+    gff_id_destroy(&aux->gene_ids);
+
+    return 0;
+}
+
+/*
+    Allocate a zero-initialized gff_t for the file fname. The string is not
+    copied, the caller must keep it valid for as long as the structure is used.
+    An allocation failure is reported via error(). Release the structure with
+    gff_destroy().
+*/
+gff_t *gff_init(const char *fname)
+{
+    gff_t *gff = (gff_t*) calloc(1, sizeof(gff_t));
+    if ( !gff ) error("Error: could not allocate %zu bytes\n", sizeof(gff_t));
+    gff->fname = fname;
+    return gff;
+}
+/*
+    Release a gff_t created by gff_init(): the gene records kept in gid2gene,
+    the four region indexes, the transcript id table and the structure itself.
+
+    It is safe to pass NULL, and to destroy a structure on which gff_parse()
+    was never run: indexes which were not created are left alone.
+*/
+void gff_destroy(gff_t *gff)
+{
+    if ( !gff ) return;
+
+    khint_t k;
+    if ( gff->init.gid2gene )
+    {
+        // the hash owns the gene records and their names
+        for (k=0; k<kh_end(gff->init.gid2gene); k++)
+        {
+            if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+            gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+            free(gene->name);
+            free(gene);
+        }
+        kh_destroy(int2gene,gff->init.gid2gene);
+    }
+
+    if ( gff->idx_cds ) regidx_destroy(gff->idx_cds);
+    if ( gff->idx_utr ) regidx_destroy(gff->idx_utr);
+    if ( gff->idx_exon ) regidx_destroy(gff->idx_exon);
+    if ( gff->idx_tscript ) regidx_destroy(gff->idx_tscript);
+
+    gff_id_destroy(&gff->tscript_ids);
+    free(gff);
+}
+
diff --git a/bcftools/gff.c.pysam.c b/bcftools/gff.c.pysam.c

new file mode 100644 (file)

index 0000000..f5c817d
--- /dev/null
+++ b/bcftools/gff.c.pysam.c
@@ -0,0 +1,1100 @@
+#include "bcftools.pysam.h"
+
+/* The MIT License
+
+   Copyright (c) 2023 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "gff.h"
+
+/*
+    Helper structures, only for initialization
+
+    ftr_t
+        temporary list of all exons, CDS, UTRs
+*/
+// Hash maps with integer keys: int2tscript holds gf_tscript_t pointers (see aux_t.id2tr),
+// int2gene holds gf_gene_t pointers (see aux_t.gid2gene)
+KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+// One exon, CDS or UTR line, kept only until the feature is attached to its transcript
+typedef struct
+{
+    int type;           // GF_CDS, GF_EXON, GF_UTR5, GF_UTR3
+    uint32_t beg;       // coordinates as stored by gff_parse_beg_end, i.e. the value in the file minus one
+    uint32_t end;
+    uint32_t trid;      // numeric id of the parent transcript
+    uint32_t strand:1;  // STRAND_REV,STRAND_FWD; a 1-bit unsigned field, only two values can be stored
+    uint32_t phase:2;   // 0, 1, 2, or 3 for unknown
+    uint32_t iseq:29;   // sequence index as returned by feature_set_seq, which asserts nseq < 1<<29
+}
+ftr_t;
+
+/*
+    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+    to integer id.  To keep the memory requirements low, the original version
+    relied on IDs in the form of a string prefix and a numerical id.  However,
+    it turns out that this assumption is not valid for some ensembl GFFs, see
+    for example Zea_mays.AGPv4.36.gff3.gz
+ */
+// Two-way mapping between ID strings and numeric ids, initialized by gff_id_init
+typedef struct
+{
+    void *str2id;       // khash_str2int
+    int nstr, mstr;     // presumably the number of used and allocated entries of str - confirm in gff_id_register
+    char **str;         // numeric id to string
+}
+id_tbl_t;
+
+// Parser state, embedded in gff_t as the member `init`
+typedef struct
+{
+    // all exons, CDS, UTRs
+    ftr_t *ftr;
+    int nftr, mftr;
+
+    // mapping from gene id to gf_gene_t
+    kh_int2gene_t *gid2gene;
+
+    // mapping from transcript id to tscript, for quick CDS anchoring
+    kh_int2tscript_t *id2tr;
+
+    // sequences: seq2int maps a chromosome name to its index in seq, see feature_set_seq
+    void *seq2int;  // str2int hash
+    char **seq;
+    int nseq, mseq;
+
+    // ignored biotypes
+    void *ignored_biotypes;
+
+    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
+
+    // pointers to the current partially processed line
+    // (chr_end addresses the last character of the chromosome name: gff_parse_chr sets it
+    // to the character before the tab)
+    char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end,
+         *chr, *chr_end, *name, *name_end, *type, *type_end;
+}
+aux_t;
+
+// The GFF reader: options, parser state and the resulting indexes
+struct gff_t_
+{
+    // input file and optional output file for the parsed annotation (see gff_set)
+    const char *fname, *dump_fname;
+
+    // the main regidx lookups, from chr:beg-end to overlapping features and
+    // index iterator
+    regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+
+    // temporary structures, deleted after initialization
+    aux_t init;
+
+    // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
+    id_tbl_t tscript_ids;
+
+    // strip_chr_names: drop a leading "chr" (case-insensitive) from chromosome names, see gff_parse_chr
+    int strip_chr_names, verbosity;
+    int force;      // force run under various conditions. Currently only to skip out-of-phase transcripts
+
+    // number of times each kind of warning was triggered
+    struct {
+        int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+        int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+    } warned;
+};
+
+// Names of the feature types, used by gf_type2gff_string. The tables are indexed by a
+// number derived from the type minus one, so the order of the entries must not change.
+// NOTE(review): "ribozyme" is listed twice in the non-coding table - presumably this
+// mirrors the type constants, confirm before touching.
+static const char *gf_strings_noncoding[] =
+{
+    "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+    "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+    "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+    "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+    "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+    "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene",    "translated_unprocessed_pseudogene",
+    "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+    "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf",
+    "lncRNA"
+};
+// types for which GF_is_coding() is true
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+// non-coding types which are not smaller than 1<<GF_coding_bit
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+/*
+    Set an option. The key determines the type of the single variadic argument:
+        dump_fname          .. char*
+        force_out_of_phase  .. int
+        strip_chr_names     .. int
+        verbosity           .. int
+    Any other key is reported via error(). Returns 0.
+*/
+int gff_set(gff_t *gff, gff_opt_t key, ...)
+{
+    va_list ap;
+
+    if ( key==dump_fname )
+    {
+        va_start(ap, key);
+        gff->dump_fname = va_arg(ap,char*);
+        va_end(ap);
+    }
+    else if ( key==force_out_of_phase )
+    {
+        va_start(ap, key);
+        gff->force = va_arg(ap,int);
+        va_end(ap);
+    }
+    else if ( key==strip_chr_names )
+    {
+        va_start(ap, key);
+        gff->strip_chr_names = va_arg(ap,int);
+        va_end(ap);
+    }
+    else if ( key==verbosity )
+    {
+        va_start(ap, key);
+        gff->verbosity = va_arg(ap,int);
+        va_end(ap);
+    }
+    else
+        error("The key %d is not supported with gff_set\n",key);
+
+    return 0;
+}
+
+// Return the index selected by key (idx_cds, idx_utr, idx_exon or idx_tscript).
+// Any other key is reported via error().
+void *gff_get(gff_t *gff, gff_opt_t key)
+{
+    if ( key==idx_cds ) return gff->idx_cds;
+    if ( key==idx_utr ) return gff->idx_utr;
+    if ( key==idx_exon ) return gff->idx_exon;
+    if ( key==idx_tscript ) return gff->idx_tscript;
+
+    error("The key %d is not supported with gff_get\n",key);
+    return NULL;
+}
+
+// Translate a numeric id to the ID string. The type argument is not consulted,
+// currently only transcript ids are supported.
+const char *gff_id2string(gff_t *gff, id_type_t type, int id)
+{
+    char **names = gff->tscript_ids.str;
+    return names[id];
+}
+
+// Translate a feature type to its name. Depending on the type, the name comes from the
+// coding, the non-coding or the special table; in each case the index is the table-specific
+// part of the type minus one.
+const char *gf_type2gff_string(int type)
+{
+    if ( GF_is_coding(type) )
+    {
+        int idx = type & ((1<<GF_coding_bit) - 1);
+        return gf_strings_coding[idx - 1];
+    }
+
+    if ( type < (1<<GF_coding_bit) )
+        return gf_strings_noncoding[type-1];
+
+    int idx = type & ((1<<(GF_coding_bit+1)) - 1);
+    return gf_strings_special[idx - 1];
+}
+
+/*
+    gff parsing functions
+*/
+// Return the numeric id of the sequence name spanning chr_beg..chr_end (inclusive), registering
+// the name in aux->seq and aux->seq2int on first use. The byte following chr_end is temporarily
+// overwritten with a NUL so that the name can be used as a C string, and restored before return.
+static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
+{
+    aux_t *aux = &gff->init;
+    int iseq;
+
+    char saved = chr_end[1];
+    chr_end[1] = 0;
+
+    int known = khash_str2int_get(aux->seq2int, chr_beg, &iseq)==0;
+    if ( !known )
+    {
+        // first occurrence: keep a private copy of the name, it is also used as the hash key
+        hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+        aux->seq[aux->nseq] = strdup(chr_beg);
+        iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+        aux->nseq++;
+        assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+    }
+
+    chr_end[1] = saved;
+    return iseq;
+}
+// Skip the current tab-delimited column: returns a pointer to the character following the next
+// tab at or after ss. A parse error is raised through error() when no tab is found; line is
+// used only for the error message.
+static inline char *gff_skip(const char *line, char *ss)
+{
+    char *p;
+    for (p = ss; *p; p++)
+        if ( *p=='\t' ) return p+1;
+    error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return p+1;
+}
+// Locate the first column (sequence name) of the line. On return *chr_beg points to the first
+// character of the name and *chr_end to the character preceding the first tab. When
+// gff->strip_chr_names is set, a leading "chr" (any case) is excluded from the name.
+// A parse error is raised through error() when the line contains no tab.
+static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end)
+{
+    char *tab = (char*) line;
+    for (; *tab; tab++)
+        if ( *tab=='\t' ) break;
+    if ( !*tab ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    char *name = (char*) line;
+    if ( gff->strip_chr_names && strncasecmp("chr",line,3)==0 ) name += 3;
+
+    *chr_beg = name;
+    *chr_end = tab - 1;
+}
+// Parse the two coordinate columns starting at ss. The parsed values are decremented by one
+// before being stored in *beg and *end. Returns a pointer to the character following the
+// delimiter after the second number. A parse error is raised through error() when either number
+// is missing, or when the line ends right after a number: stepping over the terminating NUL
+// would make the caller read beyond the end of the line.
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+    char *se = ss;
+    *beg = strtol(ss, &se, 10) - 1;
+    if ( ss==se || !*se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+    ss = se+1;
+    *end = strtol(ss, &se, 10) - 1;
+    if ( ss==se || !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return se+1;
+}
+// Reset the id table to all zeros and create its string-to-id hash.
+static void gff_id_init(id_tbl_t *tbl)
+{
+    memset(tbl, 0, sizeof(id_tbl_t));
+
+    tbl->str2id = khash_str2int_init();
+}
+// Release the id table: the array of string pointers, and the hash together with its keys
+// (the keys are the strings allocated in gff_id_register).
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+    free(tbl->str);
+    khash_str2int_destroy_free(tbl->str2id);
+}
+// Look up the string spanning beg..end (inclusive) in the id table, adding it under the next
+// free id when not present yet. The id is stored in *id_ptr. The byte following end is
+// temporarily replaced with a NUL and restored before return. Always returns 0.
+static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr)
+{
+    int id;
+    char saved = end[1];
+    end[1] = 0;
+
+    int found = khash_str2int_get(tbl->str2id, beg, &id) >= 0;
+    if ( !found )
+    {
+        id = tbl->nstr;
+        tbl->nstr++;
+        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+        // the copy is shared by the string array and the hash key
+        char *copy = strdup(beg);
+        tbl->str[id] = copy;
+        khash_str2int_set(tbl->str2id, copy, id);
+    }
+
+    end[1] = saved;
+    *id_ptr = id;
+    return 0;
+}
+// Translate a biotype string to the corresponding GF_* constant. Only the leading characters
+// are compared, so the string does not have to be NUL-terminated right after the biotype (it is
+// typically followed by ';' and more attributes). Within each case the longer names must be
+// tested before names that are their prefixes.
+// Returns -1 when line is NULL, 0 when the biotype is not recognised.
+static inline int gff_parse_biotype(char *line)
+{
+    if ( !line ) return -1;
+    switch (*line)
+    {
+        case 'p':
+            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+            break;
+        case 'a':
+            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+            break;
+        case 'I':
+            if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C;
+            else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D;
+            else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J;
+            else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V;
+            else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV;
+            break;
+        case 'T':
+            if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C;
+            else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D;
+            else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J;
+            else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V;
+            break;
+        case 'M':
+            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+            else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+            else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_rRNA;     // was mapped to GF_MT_tRNA by mistake
+            else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'l':
+            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+            // compare 6 characters only: with 7 the terminating NUL took part in the comparison
+            // and "lncRNA" followed by ';' was never recognised
+            if ( !strncmp(line,"lncRNA",6) ) return GF_lncRNA;
+            break;
+        case 'm':
+            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+            else if ( !strncasecmp(line,"mRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'r':
+            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+            break;
+        case 's':
+            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+            break;
+        case 't':
+            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+            break;
+        case 'n':
+            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+            break;
+        case 'N':
+            if ( !strncmp(line,"NMD",3) ) return GF_NMD;
+            break;
+        case 'k':
+            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+            break;
+        case 'u':
+            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+            break;
+        case 'L':
+            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+            break;
+        case '3':
+            if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            break;
+        case 'd':
+            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+            break;
+        case 'v':
+            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+            break;
+        case 'b':
+            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+            break;
+    }
+    return 0;
+}
+// Count one more occurrence of the unsupported biotype spanning ss..se (inclusive) in
+// gff->init.ignored_biotypes. Returns 0 when no biotype was given (ss is NULL), 1 otherwise.
+// The byte following se is temporarily replaced with a NUL and restored before return.
+static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se)
+{
+    if ( !ss ) return 0;
+
+    char saved = se[1];
+    se[1] = 0;
+
+    int count = 0;
+    char *key;
+    if ( khash_str2int_get(gff->init.ignored_biotypes, ss, &count)==0 )
+        key = ss;               // already present
+    else
+        key = strdup(ss);       // new entry, the hash needs its own copy of the key
+    khash_str2int_set(gff->init.ignored_biotypes, key, count+1);
+
+    se[1] = saved;
+    return 1;
+}
+// Return the gene record stored in aux->gid2gene under gene_id. When there is none yet, a
+// zero-initialised record is allocated and inserted first.
+static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+    khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+    if ( k != kh_end(aux->gid2gene) && kh_val(aux->gid2gene, k) )
+        return kh_val(aux->gid2gene, k);
+
+    int ret;
+    gf_gene_t *gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+    k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+    kh_val(aux->gid2gene,k) = gene;
+    return gene;
+}
+// Register a transcript from the line most recently processed by gff_parse_line (the attribute
+// pointers are read from gff->init). The transcript type is taken from the biotype attribute
+// or, failing that, from the feature type column. Transcripts with unsupported biotype are
+// counted and skipped. Lines without ID or Parent are reported through error().
+static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    ftr->type = gff_parse_biotype(aux->biotype);
+    if ( ftr->type <= 0 )
+    {
+        // fall back to the feature type column, NUL-terminated for the duration of the call
+        char *type_end = aux->type_end;
+        char saved = type_end[1];
+        type_end[1] = 0;
+        ftr->type = gff_parse_biotype(aux->type);
+        type_end[1] = saved;
+    }
+    if ( ftr->type <= 0 )
+    {
+        int counted = gff_ignored_biotype(gff,aux->biotype,aux->biotype_end);
+        if ( counted || gff->verbosity <= 0 ) return;
+
+        // no biotype at all: warn once, or each time with higher verbosity
+        if ( gff->verbosity > 1 || !gff->warned.unknown_tscript_biotype )
+            fprintf(bcftools_stderr,"Warning: Ignoring transcript with unknown biotype .. %s\n", line);
+        gff->warned.unknown_tscript_biotype++;
+        return;
+    }
+
+    if ( !aux->id )
+        error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    if ( !aux->parent )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    uint32_t trid, gene_id;
+    gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid);
+    gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id);
+
+    gf_tscript_t *tr = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t));
+    tr->id     = trid;
+    tr->gene   = gene_init(aux, gene_id);
+    tr->type   = ftr->type;
+    tr->strand = ftr->strand;
+    tr->beg    = ftr->beg;
+    tr->end    = ftr->end;
+
+    // make the transcript searchable by its numeric id
+    int ret;
+    khint_t k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+    kh_val(aux->id2tr,k) = tr;
+}
+// Register exon, CDS, UTR: the feature is associated with the id of its parent transcript and
+// with the numeric id of its sequence. Features with unknown strand or phase are only warned
+// about here. A missing Parent attribute is reported through error().
+static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( !aux->parent )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // the parent transcript id
+    gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
+
+    int verbose = gff->verbosity > 0;
+    int very_verbose = gff->verbosity > 1;
+    if ( verbose && ftr->strand==-1 )
+    {
+        if ( very_verbose || !gff->warned.unknown_strand )
+            fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
+        gff->warned.unknown_strand++;
+    }
+    if ( verbose && ftr->phase==-1 )
+    {
+        if ( very_verbose || !gff->warned.unknown_phase )
+            fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
+        gff->warned.unknown_phase++;
+    }
+
+    ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end);
+}
+// Register a gene from the line most recently processed by gff_parse_line. Lines without ID are
+// silently skipped. A gene which already has a name is considered a duplicate: a warning is
+// printed and the record is left untouched. The gene name is taken from the Name attribute
+// or, when absent, from the gene ID.
+static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( !aux->id ) return;
+
+    uint32_t gene_id;
+    gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id);
+
+    gf_gene_t *gene = gene_init(aux, gene_id);
+    if ( gene->name )
+    {
+        if ( gff->verbosity > 1 || !gff->warned.duplicate_id )
+            fprintf(bcftools_stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line);
+        gff->warned.duplicate_id++;
+        return;
+    }
+
+    gene->id     = gene_id;
+    gene->iseq   = feature_set_seq(gff, aux->chr,aux->chr_end);
+    gene->strand = ftr->strand;
+    gene->beg    = ftr->beg;
+    gene->end    = ftr->end;
+
+    if ( !aux->name )
+    {
+        // Name=<GeneName> field is not present, use the gene ID instead
+        gene->name = strdup(aux->gene_ids.str[gene_id]);
+        return;
+    }
+
+    // name_end points to the last character of the name, hence the +1
+    size_t len = aux->name_end - aux->name + 1;
+    gene->name = (char*) malloc(len + 1);
+    memcpy(gene->name,aux->name,len);
+    gene->name[len] = 0;
+}
+
+// Parse one line of the GFF file. The coordinates, strand, phase and type are stored in ftr,
+// pointers to the sequence name, feature type and the ID, Parent, Name and biotype attributes
+// in gff->init. Genes and transcripts are registered immediately.
+// Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them,
+// or -1 to indicate the structure needs not be saved (either because of an error or because saved
+// as transcript or gene.)
+static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
+{
+    // - skip empty lines and commented lines
+    // - columns
+    //      1.      chr
+    //      2.      <skip>
+    //      3.      CDS, transcript, gene, ...
+    //      4-5.    beg,end
+    //      6.      <skip>
+    //      7.      strand
+    //      8.      phase
+    //      9.      Parent=transcript:ENST(\d+);ID=...;biotype=... etc
+
+    char *ss = line;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    aux_t *aux = &gff->init;
+    gff_parse_chr(gff, line, &aux->chr, &aux->chr_end);
+    ss = gff_skip(line, aux->chr_end + 2);      // chr_end+1 is the tab, the 2nd column starts right after
+
+    // 3rd column: is this a CDS, transcript, gene, etc.. The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3
+    int is_gene_line = 0;
+    ftr->type = 0;
+    aux->type = ss;
+    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+    else if ( !strncmp("biological_region\t",ss,18) ) { return -1; }    // skip
+    else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; }
+    else ss = gff_skip(line, ss);
+    aux->type_end = ss - 1;
+
+    // 4-5th columns: beg,end
+    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+
+    // 6th column: skip
+    ss = gff_skip(line, ss);
+
+    // 7th column: strand
+    ftr->strand = -1;
+    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+    // move to the next column, but never step over the terminating NUL of a truncated line
+    while ( *ss && *ss!='\t' ) ss++;
+    if ( *ss ) ss++;
+
+    // 8th column: phase (codon offset)
+    ftr->phase = -1;
+    if ( *ss == '0' ) ftr->phase = 0;
+    else if ( *ss == '1' ) ftr->phase = 1;
+    else if ( *ss == '2' ) ftr->phase = 2;
+    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
+    // same as above: a line without the attributes column leaves ss at the terminating NUL
+    while ( *ss && *ss!='\t' ) ss++;
+    if ( *ss ) ss++;
+
+    // 9th column: id, parent, name, biotype
+    aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL;
+    while ( *ss )
+    {
+        // es points to the ';' terminating the current attribute or to the end of the line
+        char *es = ss;
+        while ( *es && *es!=';' ) es++;
+        if ( !strncmp(ss,"ID=",3) )
+        {
+            ss += 3;
+            aux->id_end = es - 1;
+            aux->id = ss;
+            if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; }
+            else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11;
+        }
+        else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; }
+        else if ( !strncmp(ss,"Parent=",7) )
+        {
+            ss += 7;
+            aux->parent_end = es - 1;
+            aux->parent = ss;
+            if ( !strncmp(ss,"gene:",5) ) aux->parent += 5;
+            else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11;
+        }
+        else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; }
+        else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; }
+        if ( !*es ) break;
+        ss = es + 1;
+    }
+
+    // features without a parent are treated as genes
+    if ( is_gene_line || !aux->parent )
+    {
+        gff_parse_gene(gff, line, ftr);
+        return -1;
+    }
+
+    // exon, CDS, UTR: the only features the caller keeps
+    if ( ftr->type )
+    {
+        gff_parse_exon(gff, line, ftr);
+        return 0;
+    }
+
+    gff_parse_transcript(gff, line, ftr);
+    return -1;
+}
+
+// qsort comparison function ordering the CDS pointers of a transcript by ascending beg
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+    gf_cds_t *x = *((gf_cds_t**)a);
+    gf_cds_t *y = *((gf_cds_t**)b);
+    return (x->beg > y->beg) - (x->beg < y->beg);
+}
+
+// Set *chr_beg and *chr_end to the first and the last character of the name of the sequence
+// iseq. For an empty name both point to the terminating NUL; the previous implementation
+// read one byte beyond the end of the string in that case.
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+    char *seq = aux->seq[iseq];
+    size_t len = strlen(seq);
+    *chr_beg = seq;
+    *chr_end = len ? seq + len - 1 : seq;
+}
+// Return the transcript registered in aux->id2tr under trid. The transcript must exist, this
+// is enforced with an assert.
+static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+    gf_tscript_t *tr = NULL;
+    khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+    if ( k != kh_end(aux->id2tr) ) tr = kh_val(aux->id2tr, k);
+    assert( tr );
+    return tr;
+}
+// Attach the CDS described by ftr (the result of parsing a gff CDS line) to its transcript.
+// The CDS is only appended to tr->cds here. A strand which differs from the strand of the
+// transcript is reported through error().
+static void register_cds(gff_t *gff, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    gf_tscript_t *tr = tscript_init(aux, ftr->trid);
+    if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+    gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(*cds));
+    cds->tr    = tr;
+    cds->icds  = 0;     // to keep valgrind on mac happy
+    cds->phase = ftr->phase;
+    cds->beg   = ftr->beg;
+    cds->len   = ftr->end - ftr->beg + 1;
+
+    // append to the list of CDS of the transcript
+    hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
+    tr->cds[tr->ncds] = cds;
+    tr->ncds++;
+}
+// Create a UTR record from ftr and insert it into gff->idx_utr under the sequence of the gene
+// the parent transcript belongs to.
+static void register_utr(gff_t *gff, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+    utr->tr  = tscript_init(aux, ftr->trid);
+    utr->beg = ftr->beg;
+    utr->end = ftr->end;
+    if ( ftr->type==GF_UTR3 ) utr->which = prime3;
+    else utr->which = prime5;
+
+    char *name_beg, *name_end;
+    chr_beg_end(aux, utr->tr->gene->iseq, &name_beg, &name_end);
+    regidx_push(gff->idx_utr, name_beg,name_end, utr->beg,utr->end, &utr);
+}
+// Create an exon record from ftr and insert it into gff->idx_exon under the sequence of the
+// gene the parent transcript belongs to. The indexed interval is the exon extended by
+// N_SPLICE_REGION_INTRON on both sides.
+static void register_exon(gff_t *gff, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+    exon->tr  = tscript_init(aux, ftr->trid);
+    exon->beg = ftr->beg;
+    exon->end = ftr->end;
+
+    char *name_beg, *name_end;
+    chr_beg_end(aux, exon->tr->gene->iseq, &name_beg, &name_end);
+    regidx_push(gff->idx_exon, name_beg,name_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+// Finalize all registered transcripts: every transcript is inserted into idx_tscript and, for
+// transcripts with CDS, the CDS are sorted by position, the leading phase is trimmed, the phase
+// column is checked against the accumulated CDS length, an incomplete last codon is trimmed,
+// and finally the CDS offsets are set and the CDS inserted into idx_cds.
+// Transcripts with inconsistent phase raise an error unless gff->force is set, in which case
+// they are skipped (their CDS are not indexed).
+static void tscript_init_cds(gff_t *gff)
+{
+    aux_t *aux = &gff->init;
+
+    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+    khint_t k;
+    for (k=0; k<kh_end(aux->id2tr); k++)
+    {
+        if ( !kh_exist(aux->id2tr, k) ) continue;
+        gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k);
+
+        // position-to-tscript lookup
+        char *chr_beg, *chr_end;
+        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+        regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+        if ( !tr->ncds ) continue;      // transcript with no CDS
+
+        // sort CDs
+        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+        // trim non-coding start
+        int i, len = 0;
+        if ( tr->strand==STRAND_FWD )
+        {
+            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
+            {
+                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+                tr->cds[0]->beg += tr->cds[0]->phase;
+                tr->cds[0]->len -= tr->cds[0]->phase;
+                tr->cds[0]->phase = 0;
+            }
+
+            // sanity check phase; the phase number in gff tells us how many bases to skip in this
+            // feature to reach the first base of the next codon
+            int tscript_ok = 1;
+            for (i=0; i<tr->ncds; i++)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(bcftools_stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+        else
+        {
+            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
+            {
+                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141.
+                // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff
+                // todo: the same for the fwd strand
+                i = tr->ncds - 1;
+                int phase = tr->cds[i]->phase;
+                if ( phase ) tr->trim |= TRIM_5PRIME;
+                while ( i>=0 && phase > tr->cds[i]->len )
+                {
+                    phase -= tr->cds[i]->len;
+                    tr->cds[i]->phase = 0;
+                    tr->cds[i]->len   = 0;
+                    i--;
+                }
+                // The loop above can consume all CDS when the phase exceeds their combined length,
+                // leaving i at -1. There is nothing left to trim then and tr->cds[-1] must not be accessed.
+                if ( i>=0 )
+                {
+                    if ( gff->verbosity > 0 && tr->cds[i]->phase )
+                    {
+                        if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                            fprintf(bcftools_stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.incomplete_cds++;
+                    }
+                    tr->cds[i]->len  -= tr->cds[i]->phase;
+                    tr->cds[i]->phase = 0;
+                }
+            }
+
+            // sanity check phase
+            int tscript_ok = 1;
+            for (i=tr->ncds-1; i>=0; i--)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(bcftools_stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+
+        // set len. At the same check that CDS within a transcript do not overlap
+        len = 0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->icds = i;
+            len += tr->cds[i]->len;
+            if ( !i ) continue;
+
+            gf_cds_t *a = tr->cds[i-1];
+            gf_cds_t *b = tr->cds[i];
+            if ( a->beg + a->len - 1 >= b->beg )
+            {
+                if ( gff->verbosity > 0 )
+                {
+                    if ( !gff->warned.overlapping_cds || gff->verbosity > 1 )
+                        fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n",
+                                gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+                    gff->warned.overlapping_cds++;
+                }
+            }
+        }
+
+        if ( len%3 != 0 )
+        {
+            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+            if ( gff->verbosity > 0 )
+            {
+                if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                    fprintf(bcftools_stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                gff->warned.incomplete_cds++;
+            }
+
+            tr->trim |= TRIM_3PRIME;
+            if ( tr->strand==STRAND_FWD )
+            {
+                // forward strand: the 3' end is the last CDS, shorten from its end
+                i = tr->ncds - 1;
+                while ( i>=0 && len%3 )
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    len -= dlen;
+                    i--;
+                }
+            }
+            else
+            {
+                // reverse strand: the 3' end is the first CDS, shorten from its beginning
+                i = 0;
+                while ( i<tr->ncds && len%3 )
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    tr->cds[i]->beg += dlen;
+                    len -= dlen;
+                    i++;
+                }
+            }
+        }
+
+        // set CDS offsets and insert into regidx
+        len=0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->pos = len;
+            len += tr->cds[i]->len;
+            regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+        }
+    }
+}
+
+// regidx payload destructor: the payload holds a pointer to a heap-allocated record, free it
+static void regidx_free_gf(void *payload)
+{
+    gf_cds_t *rec = *((gf_cds_t**)payload);
+    free(rec);
+}
+// regidx payload destructor for transcripts: frees the array of CDS pointers and the transcript
+static void regidx_free_tscript(void *payload)
+{
+    gf_tscript_t *tr = *((gf_tscript_t**)payload);
+    free(tr->cds);
+    free(tr);
+}
+
+static int gff_dump(gff_t *gff, const char *fname)
+{
+    BGZF *out = bgzf_open(fname,"wg");
+    if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno));
+
+    kstring_t str = {0,0,0};
+
+    khint_t k;
+    for (k=0; k<kh_end(gff->init.gid2gene); k++)
+    {
+        if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+        gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+        char *gene_id = gff->init.gene_ids.str[gene->id];
+        str.l = 0;
+        ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+
+    regitr_t *itr = regitr_init(gff->idx_tscript);
+    while ( regitr_loop(itr) )
+    {
+        gf_tscript_t *tr = regitr_payload(itr, gf_tscript_t*);
+        char *gene_id =  gff->init.gene_ids.str[tr->gene->id];
+        const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type);
+        str.l = 0;
+        ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_cds);
+    while ( regitr_loop(itr) )
+    {
+        gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
+        gf_tscript_t *tr = cds->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_utr);
+    while ( regitr_loop(itr) )
+    {
+        gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
+        gf_tscript_t *tr = utr->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    itr = regitr_init(gff->idx_exon);
+    while ( regitr_loop(itr) )
+    {
+        gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
+        gf_tscript_t *tr = exon->tr;
+        str.l = 0;
+        ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(itr);
+
+    if ( bgzf_close(out)!=0 ) error("Error: close failed .. %s\n", fname);
+    free(str.s);
+
+    return 0;
+}
+
+/*
+    gff_parse() - read the GFF file gff->fname and build the regidx lookup indexes
+
+    The work is done in two steps:
+        1. every non-empty line is passed to gff_parse_line() and the lines it
+           accepts (return value 0) are collected in the array aux->ftr
+        2. collected features whose transcript id is found in aux->id2tr are
+           registered in idx_cds, idx_exon or idx_utr; the transcript and its
+           gene are flagged as used
+
+    The temporary parsing data is released before returning, with the exception
+    of aux->gid2gene which gff_destroy() needs in order to free the genes.
+
+    Returns 0. Failures are reported through error(), which is assumed not to return.
+*/
+int gff_parse(gff_t *gff)
+{
+    if ( gff->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", gff->fname);
+
+    aux_t *aux = &gff->init;
+    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
+    aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
+    aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
+    gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
+    aux->ignored_biotypes = khash_str2int_init();
+    gff_id_init(&aux->gene_ids);
+    gff_id_init(&gff->tscript_ids);
+
+    // parse gff
+    kstring_t str = {0,0,0};
+    htsFile *fp = hts_open(gff->fname,"r");
+    if ( !fp ) error("Failed to read %s\n", gff->fname);
+
+    // hts_getline() returns the length of the line, so an empty line yields 0 and
+    // only negative values signal the end of file (-1) or a read error (< -1).
+    // Testing for ">0" would silently stop parsing at the first blank line.
+    int nread;
+    while ( (nread=hts_getline(fp, KS_SEP_LINE, &str)) >= 0 )
+    {
+        if ( !nread ) continue;     // blank line, nothing to parse
+        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+        int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr);
+        if ( !ret ) aux->nftr++;    // keep the slot only if the line was accepted
+    }
+    if ( nread < -1 ) error("Failed to read %s\n", gff->fname);
+    free(str.s);
+    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname);
+
+
+    // process gff information: connect CDS and exons to transcripts
+    gff->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+    gff->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+    gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+
+    int i;
+    for (i=0; i<aux->nftr; i++)
+    {
+        ftr_t *ftr = &aux->ftr[i];
+
+        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+        if ( k==kh_end(aux->id2tr) ) continue;       // no corresponding transcript registered, must be an unsupported biotype
+
+        gf_tscript_t *tr = kh_val(aux->id2tr,k);
+        tr->used = 1;
+        tr->gene->used = 1;
+
+        // populate regidx by category:
+        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+        if ( ftr->type==GF_CDS ) register_cds(gff, ftr);
+        else if ( ftr->type==GF_EXON ) register_exon(gff, ftr);
+        else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
+        else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
+        else
+            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+    }
+    tscript_init_cds(gff);
+
+    if ( gff->verbosity > 0 )
+    {
+        fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+                regidx_nregs(gff->idx_tscript),
+                regidx_nregs(gff->idx_exon),
+                regidx_nregs(gff->idx_cds),
+                regidx_nregs(gff->idx_utr));
+    }
+
+    if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
+    {
+        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+        fprintf(bcftools_stderr,"Ignored the following biotypes:\n");
+        for (i = kh_begin(ign); i < kh_end(ign); i++)
+        {
+            if ( !kh_exist(ign,i)) continue;
+            const char *biotype = kh_key(ign,i);
+            // the GENCODE biotype is spelled "TEC"; expand it because the acronym is not self-explanatory
+            if ( !strcmp(biotype,"TEC") ) biotype = "TEC (\"To be Experimentally Confirmed\")";
+            fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
+        }
+    }
+    khash_str2int_destroy_free(aux->ignored_biotypes);
+
+    // warn about warnings that were counted but not printed
+    if ( gff->verbosity > 0 )
+    {
+        int nwarn = 0;
+        // with verbosity 1 all but one occurrence of each warning type count as
+        // suppressed, with verbosity > 1 nothing is counted
+        #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1;
+        INC_NWARN(unknown_chr);
+        INC_NWARN(unknown_tscript_biotype);
+        INC_NWARN(unknown_strand);
+        INC_NWARN(unknown_phase);
+        INC_NWARN(duplicate_id);
+        INC_NWARN(unknown_cds_phase);
+        INC_NWARN(incomplete_cds);
+        INC_NWARN(wrong_phase);
+        INC_NWARN(overlapping_cds);
+        #undef INC_NWARN
+        if ( nwarn > 0 )
+            fprintf(bcftools_stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+    }
+
+    if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname);
+
+    if (  !regidx_nregs(gff->idx_tscript) )
+        error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+              "       or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+              "       of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
+
+    // release the data needed only while parsing
+    free(aux->seq);
+    free(aux->ftr);
+    khash_str2int_destroy_free(aux->seq2int);
+    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+    kh_destroy(int2tscript,aux->id2tr);
+    gff_id_destroy(&aux->gene_ids);
+
+    return 0;
+}
+
+/*
+    gff_init() - allocate a zero-initialised gff_t for the GFF file fname
+
+    Only the fname pointer is stored, the string is not copied, so it must stay
+    valid for as long as the returned object is in use. Nothing is read here;
+    the file is opened by gff_parse(). Release the object with gff_destroy().
+*/
+gff_t *gff_init(const char *fname)
+{
+    gff_t *gff = calloc(1,sizeof(*gff));
+    // fail with a message rather than dereferencing NULL below
+    if ( !gff ) error("Error: could not allocate memory for the GFF parser\n");
+    gff->fname = fname;
+    return gff;
+}
+/*
+    gff_destroy() - release a gff_t created by gff_init()
+
+    Frees the genes stored in init.gid2gene (kept alive by gff_parse() for this
+    purpose), the four regidx indexes, the transcript id table and the object
+    itself. Passing NULL is a no-op.
+*/
+void gff_destroy(gff_t *gff)
+{
+    if ( !gff ) return;
+
+    khint_t k;
+    if ( gff->init.gid2gene )
+    {
+        for (k=0; k<kh_end(gff->init.gid2gene); k++)
+        {
+            if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+            gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+            free(gene->name);
+            free(gene);
+        }
+        kh_destroy(int2gene,gff->init.gid2gene);
+    }
+
+    // The indexes are created by gff_parse() and are still NULL if it was never
+    // called. NOTE(review): guarded because regidx_destroy() is not known to accept NULL.
+    if ( gff->idx_cds ) regidx_destroy(gff->idx_cds);
+    if ( gff->idx_utr ) regidx_destroy(gff->idx_utr);
+    if ( gff->idx_exon ) regidx_destroy(gff->idx_exon);
+    if ( gff->idx_tscript ) regidx_destroy(gff->idx_tscript);
+
+    gff_id_destroy(&gff->tscript_ids);
+    free(gff);
+}
+
diff --git a/bcftools/gff.h b/bcftools/gff.h

new file mode 100644 (file)

index 0000000..ebb6463
--- /dev/null
+++ b/bcftools/gff.h
@@ -0,0 +1,332 @@
+/* The MIT License
+
+   Copyright (c) 2023 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+/*
+    GFF parsing code refactored from csq.c
+
+    Things that would be nice to have
+        - dynamic N_REF_PAD
+        - for stop-lost events (also in frameshifts) report the number of truncated aa's
+        - memory could be greatly reduced by indexing gff (but it is quite compact already)
+        - deletions that go beyond transcript boundaries are not checked at sequence level
+            - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+            - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+    Read about transcript types here
+        http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+        http://www.ensembl.org/info/genome/variation/predicted_data.html
+        https://www.gencodegenes.org/pages/biotypes.html
+
+    List of supported biotypes
+        antisense
+        IG_C_gene
+        IG_D_gene
+        IG_J_gene
+        IG_LV_gene
+        IG_V_gene
+        lincRNA
+        lncRNA      .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
+        macro_lncRNA
+        miRNA
+        misc_RNA
+        Mt_rRNA
+        Mt_tRNA
+        polymorphic_pseudogene
+        processed_transcript
+        protein_coding, mRNA
+        ribozyme
+        rRNA
+        sRNA
+        scRNA
+        scaRNA
+        sense_intronic
+        sense_overlapping
+        snRNA
+        snoRNA
+        TR_C_gene
+        TR_D_gene
+        TR_J_gene
+        TR_V_gene
+
+    The gff parsing logic
+        We collect features by combining gff lines A,B,C as follows:
+            A .. gene line with a supported biotype
+                    A.ID=~/^gene:/
+
+            B .. transcript line referencing A with supported biotype
+                    B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+            C .. corresponding CDS, exon, and UTR lines:
+                    C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+        For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+        complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+    The supported consequence types, sorted by impact:
+        splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+        splice_donor_variant    .. start region of an intron changed (2bp at the 5' end of an intron)
+        stop_gained             .. DNA sequence variant resulting in a stop codon
+        frameshift_variant      .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+        stop_lost               .. elongated transcript, stop codon changed
+        start_lost              .. the first codon changed
+        inframe_altering        .. combination of indels leading to unchanged reading frame and length
+        inframe_insertion       .. inserted coding sequence, unchanged reading frame
+        inframe_deletion        .. deleted coding sequence, unchanged reading frame
+        missense_variant        .. amino acid (aa) change, unchanged length
+        splice_region_variant   .. change within 1-3 bases of the exon or 3-8 bases of the intron
+        synonymous_variant      .. DNA sequence variant resulting in no amino acid change
+        stop_retained_variant   .. different stop codon
+        start_retained_variant  .. start codon retained by indel realignment
+        non_coding_variant      .. variant in non-coding sequence, such as RNA gene
+        5_prime_UTR_variant
+        3_prime_UTR_variant
+        intron_variant          .. reported only if none of the above
+        intergenic_variant      .. reported only if none of the above
+
+
+    The annotation algorithm.
+        The algorithm checks if the variant falls in a region of a supported type. The
+        search is performed in the following order, until a match is found:
+            1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+            2. idx_utr(gf_utr_t) - check UTR hits
+            3. idx_exon(gf_exon_t) - check for splice variants
+            4. idx_tscript(gf_tscript_t) - check for intronic variants, RNAs, etc.
+
+        These regidx indexes are created by parsing a gff3 file as follows:
+            1.  create the array "ftr" of all UTR, CDS, exons. This will be
+            processed later and pruned based on transcript types we want to keep.
+            In the same go, create the hash "id2tr" of transcripts to keep
+            (based on biotype) which maps from transcript_id to a transcript. At
+            the same time also build the hash "gid2gene" which maps from gene_id to
+            gf_gene_t pointer.
+
+            2.  build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+            Use only features from "ftr" which are present in "id2tr".
+
+            3.  clean data that won't be needed anymore: ftr, id2tr (gid2gene is kept until gff_destroy, which uses it to free the genes).
+
+    Data structures.
+        idx_cds, idx_utr, idx_exon, idx_tscript:
+            as described above, regidx structures for fast lookup of exons/transcripts
+            overlapping a region; the payload is a pointer to gf_cds_t, gf_utr_t, gf_exon_t or gf_tscript_t, respectively
+*/
+
+#ifndef GFF_H__
+#define GFF_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <getopt.h>
+#include <math.h>
+#include <inttypes.h>
+#include <htslib/hts.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "regidx.h"
+
+#ifndef __FUNCTION__
+#  define __FUNCTION__ __func__
+#endif
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR         2
+#define N_SPLICE_REGION_EXON   3
+#define N_SPLICE_REGION_INTRON 8
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE   0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+
+// GFF line types
+#define GFF_UNKN_LINE    0
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE    2
+
+
+/*
+    Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
+#define GF_MT_rRNA                       1                      // non-coding: 1, 2, ...
+#define GF_MT_tRNA                       2
+#define GF_lincRNA                       3
+#define GF_miRNA                         4
+#define GF_MISC_RNA                      5
+#define GF_rRNA                          6
+#define GF_snRNA                         7
+#define GF_snoRNA                        8
+#define GF_PROCESSED_TRANSCRIPT          9
+#define GF_ANTISENSE                    10
+#define GF_macro_lncRNA                 11
+#define GF_ribozyme                     12
+#define GF_sRNA                         13
+#define GF_scRNA                        14
+#define GF_scaRNA                       15
+#define GF_SENSE_INTRONIC               16
+#define GF_SENSE_OVERLAPPING            17
+#define GF_PSEUDOGENE                   18
+#define GF_PROCESSED_PSEUDOGENE         19
+#define GF_ARTIFACT                     20
+#define GF_IG_PSEUDOGENE                21
+#define GF_IG_C_PSEUDOGENE              22
+#define GF_IG_J_PSEUDOGENE              23
+#define GF_IG_V_PSEUDOGENE              24
+#define GF_TR_V_PSEUDOGENE              25
+#define GF_TR_J_PSEUDOGENE              26
+#define GF_MT_tRNA_PSEUDOGENE           27
+#define GF_misc_RNA_PSEUDOGENE          28
+#define GF_miRNA_PSEUDOGENE             29
+#define GF_RIBOZYME                     30
+#define GF_RETAINED_INTRON              31
+#define GF_RETROTRANSPOSED              32
+#define GF_tRNA_PSEUDOGENE              33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE     34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE   35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE       36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE    37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE      38
+#define GF_KNOWN_NCRNA                          39
+#define GF_UNITARY_PSEUDOGENE                   40
+#define GF_UNPROCESSED_PSEUDOGENE               41
+#define GF_LRG_GENE                             42
+#define GF_3PRIME_OVERLAPPING_ncRNA             43
+#define GF_DISRUPTED_DOMAIN                     44
+#define GF_vaultRNA                             45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA        46
+#define GF_AMBIGUOUS_ORF                        47
+#define GF_lncRNA                               48
+#define GF_PROTEIN_CODING               (1|(1<<GF_coding_bit))  // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE       (2|(1<<GF_coding_bit))
+#define GF_IG_C                         (3|(1<<GF_coding_bit))
+#define GF_IG_D                         (4|(1<<GF_coding_bit))
+#define GF_IG_J                         (5|(1<<GF_coding_bit))
+#define GF_IG_LV                        (6|(1<<GF_coding_bit))
+#define GF_IG_V                         (7|(1<<GF_coding_bit))
+#define GF_TR_C                         (8|(1<<GF_coding_bit))
+#define GF_TR_D                         (9|(1<<GF_coding_bit))
+#define GF_TR_J                        (10|(1<<GF_coding_bit))
+#define GF_TR_V                        (11|(1<<GF_coding_bit))
+#define GF_NMD                         (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY              (13|(1<<GF_coding_bit))
+#define GF_CDS      ((1<<(GF_coding_bit+1))+1)                  // special types: 129, 130, ...
+#define GF_EXON     ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3     ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5     ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+#define CDS_PHASE_UNKN 3
+typedef struct gf_tscript_t_ gf_tscript_t;
+typedef struct          // one CDS segment of a transcript, the payload type of idx_cds
+{
+    gf_tscript_t *tr;   // transcript this CDS belongs to
+    uint32_t beg;       // the start coordinate of the CDS (on the reference strand, 0-based)
+    uint32_t pos;       // 0-based index of the first exon base within the transcript (only to
+                        //  update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+    uint32_t len;       // exon length
+    uint32_t icds:30,   // exon index within the transcript
+             phase:2;   // offset of the CDS: 0,1,2 or 3 (CDS_PHASE_UNKN) for unknown
+}
+gf_cds_t;
+typedef struct                      // a gene, the parent of one or more transcripts
+{
+    char *name;                     // human readable name, e.g. ORF45; freed by gff_destroy
+    uint32_t iseq;                  // NOTE(review): presumably the numeric id of the gene's sequence - confirm
+    uint32_t id,beg,end,strand:31,  // used only by --dump-gff
+             used:1;                // does it have any exons, CDS, UTR? Set by gff_parse
+}
+gf_gene_t;
+typedef struct          // an exon, the payload type of idx_exon
+{
+    uint32_t beg,end;   // exon coordinates; the GFF dump prints beg+1 and end+1, so presumably 0-based, inclusive
+    gf_tscript_t *tr;   // transcript this exon belongs to
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t;  // which end of the transcript the UTR is on: 3' or 5'
+typedef struct          // an untranslated region, the payload type of idx_utr
+{
+    utr_t which;        // prime3 is dumped as three_prime_UTR, prime5 as five_prime_UTR
+    uint32_t beg,end;   // UTR coordinates; the GFF dump prints beg+1 and end+1, so presumably 0-based, inclusive
+    gf_tscript_t *tr;   // transcript this UTR belongs to
+}
+gf_utr_t;
+struct gf_tscript_t_    // a transcript, the payload type of idx_tscript
+{
+    uint32_t id;        // transcript id
+    uint32_t beg,end;   // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+    uint32_t strand:1,  // STRAND_REV or STRAND_FWD
+             used:1,    // does it have any exons, UTRs, CDS?
+             ncds:30,   // number of exons
+             mcds;      // NOTE(review): full-width uint32_t, not a bitfield; presumably the allocated size of cds[] - confirm
+    gf_cds_t **cds;     // ordered list of exons
+    uint32_t trim:2,    // complete, 5' or 3' trimmed, see TRIM_* types
+             type:30;   // one of GF_* types
+    gf_gene_t *gene;    // parent gene, flagged as used together with the transcript in gff_parse
+    void *aux;          // auxiliary user data
+};
+
+typedef enum           // keys accepted by gff_set() and gff_get()
+{
+    // write options (NOTE(review): presumably the keys meant for gff_set - confirm)
+    verbosity,          // int, 0-2
+    strip_chr_names,    // int, 0 to leave as is, 1 to strip 'chr' prefix
+    force_out_of_phase, // int, 1 to proceed even if a CDS exon is out of the expected phase
+    dump_fname,         // const char*, dump the parsed GFF into this file, for debugging purposes
+
+    // read options (NOTE(review): presumably gff_get returns the regidx of the same name built by gff_parse - confirm)
+    idx_cds,
+    idx_utr,
+    idx_exon,
+    idx_tscript,
+}
+gff_opt_t;
+
+typedef enum { transcript } id_type_t;  // for gff_id2str
+
+typedef struct gff_t_ gff_t;
+
+gff_t *gff_init(const char *fname);            // allocates a zeroed gff_t; stores the fname pointer without copying the string
+int gff_parse(gff_t *gff);                     // reads the GFF and builds the regidx indexes; returns 0
+void gff_destroy(gff_t *gff);                  // frees the genes, the regidx indexes and the gff_t itself
+
+int gff_set(gff_t *gff, gff_opt_t key, ...);   // returns 0 on success
+void *gff_get(gff_t *gff, gff_opt_t key);      // NOTE(review): return value depends on key; ownership not visible here - confirm
+const char *gff_id2string(gff_t *gff, id_type_t type, int id);   // NOTE(review): presumably maps a numeric transcript id back to its GFF ID string - confirm
+const char *gf_type2gff_string(int type);      // GF_* type as the string written to the type/biotype fields of the GFF dump
+
+#endif
diff --git a/bcftools/hex.h b/bcftools/hex.h

index d915b2862a44d8b2e1c9e55a0838b88b7969e158..95210e3cf8372f3cc45a1be01ca25beec5501a49 100644 (file)
--- a/bcftools/hex.h
+++ b/bcftools/hex.h
@@ -3,10 +3,10 @@
  // hex.h
  //
  // @category   Libraries
-// @author     Nicola Asuni <nicola.asuni@genomicsplc.com>
+// @author     Nicola Asuni <info@tecnick.com>
+// @link       https://github.com/tecnickcom/variantkey
+// @license    MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE)
  // @copyright  2017-2018 GENOMICS plc
-// @license    MIT (see LICENSE)
-// @link       https://github.com/genomicsplc/variantkey
  //
  // LICENSE
  //
diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c

index 9b21b18731fd38ec40d356646be1e6c76b267e79..d42a6a36046f5425aa96b87ff11279281f419232 100644 (file)
--- a/bcftools/mpileup.c
+++ b/bcftools/mpileup.c
@@ -1,6 +1,6 @@
  /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
  
-    Copyright (C) 2008-2022 Genome Research Ltd.
+    Copyright (C) 2008-2023 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -101,6 +101,8 @@ typedef struct {
      int indels_v20;
      int argc;
      char **argv;
+    int write_index;
+    char *index_fn;
  } mplp_conf_t;
  
  typedef struct {
@@ -489,37 +491,43 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
              if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
                  // Left & right cigar op match.
                  int lr = b->core.l_qseq > 500;
-                int lm = 0, rm = 0, k;
+                int lm = 0, rm = 0, k, nm = 0;
                  for (k = 0; k < ncig; k++) {
                      int cop = bam_cigar_op(cig[k]);
                      if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
                          continue;
  
                      if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
-                        cop == BAM_CEQUAL)
+                        cop == BAM_CEQUAL) {
                          lm += bam_cigar_oplen(cig[k]);
-                    else
+                        nm++;
+                    } else {
                          break;
+                    }
                  }
  
-                for (k = ncig-1; k >= 0; k--) {
-                    int cop = bam_cigar_op(cig[k]);
-                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                // if everything is a match (or sequence (mis)match) then move on
+                // because we don't have an indel in the middle
+                if (nm != ncig) {
+                    for (k = ncig-1; k >= 0; k--) {
+                        int cop = bam_cigar_op(cig[k]);
+                        if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                            continue;
+
+                        if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                            cop == BAM_CEQUAL)
+                            rm += bam_cigar_oplen(cig[k]);
+                        else
+                            break;
+                    }
+
+                    if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
                          continue;
  
-                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
-                        cop == BAM_CEQUAL)
-                        rm += bam_cigar_oplen(cig[k]);
-                    else
-                        break;
+                    if (lm >= REALN_DIST && rm >= REALN_DIST &&
+                        has_clip < (0.15+0.05*(nt>20))*nt)
+                        continue;
                  }
-
-                if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
-                    continue;
-
-                if (lm >= REALN_DIST && rm >= REALN_DIST &&
-                    has_clip < (0.15+0.05*(nt>20))*nt)
-                    continue;
              }
  
              if (b->core.l_qseq > 500) {
@@ -849,6 +857,7 @@ static int mpileup(mplp_conf_t *conf)
      for (i=0; i<nsmpl; i++)
          bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
      if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
+    if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname);
  
      conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
                                conf->delta_baseQ);
@@ -958,6 +967,15 @@ static int mpileup(mplp_conf_t *conf)
      bcf_destroy1(conf->bcf_rec);
      if (conf->bcf_fp)
      {
+        if ( conf->write_index )
+        {
+            if ( bcf_idx_save(conf->bcf_fp)<0 )
+            {
+                if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
+                error("Error: cannot write to index %s\n",conf->index_fn);
+            }
+            free(conf->index_fn);
+        }
          if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
          bcf_hdr_destroy(conf->bcf_hdr);
          bcf_call_destroy(conf->bca);
@@ -1227,6 +1245,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
          "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
          "                          'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
          "      --threads INT       Use multithreading with INT worker threads [0]\n"
+        "      --write-index       Automatically index the output files [off]\n"
          "\n"
          "SNP/INDEL genotype likelihoods options:\n"
          "  -X, --config STR        Specify platform specific profiles (see below)\n"
@@ -1375,6 +1394,7 @@ int main_mpileup(int argc, char *argv[])
          {"seed", required_argument, NULL, 13},
          {"ambig-reads", required_argument, NULL, 14},
          {"ar", required_argument, NULL, 14},
+        {"write-index",no_argument,NULL,21},
          {NULL, 0, NULL, 0}
      };
      while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
@@ -1497,6 +1517,7 @@ int main_mpileup(int argc, char *argv[])
              }
              break;
          case  20: mplp.indels_v20 = 1; break;
+        case  21: mplp.write_index = 1; break;
          case 'A': use_orphan = 1; break;
          case 'F': mplp.min_frac = atof(optarg); break;
          case 'm': mplp.min_support = atoi(optarg); break;
diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c

index 724a0ec1147b30a34971f81df902d75ae8ad5657..81c5849c55fa88cc3ed4bdbd79c57c8389e20574 100644 (file)
--- a/bcftools/mpileup.c.pysam.c
+++ b/bcftools/mpileup.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
  
-    Copyright (C) 2008-2022 Genome Research Ltd.
+    Copyright (C) 2008-2023 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -103,6 +103,8 @@ typedef struct {
      int indels_v20;
      int argc;
      char **argv;
+    int write_index;
+    char *index_fn;
  } mplp_conf_t;
  
  typedef struct {
@@ -491,37 +493,43 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
              if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
                  // Left & right cigar op match.
                  int lr = b->core.l_qseq > 500;
-                int lm = 0, rm = 0, k;
+                int lm = 0, rm = 0, k, nm = 0;
                  for (k = 0; k < ncig; k++) {
                      int cop = bam_cigar_op(cig[k]);
                      if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
                          continue;
  
                      if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
-                        cop == BAM_CEQUAL)
+                        cop == BAM_CEQUAL) {
                          lm += bam_cigar_oplen(cig[k]);
-                    else
+                        nm++;
+                    } else {
                          break;
+                    }
                  }
  
-                for (k = ncig-1; k >= 0; k--) {
-                    int cop = bam_cigar_op(cig[k]);
-                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                // if everything is a match (or sequence (mis)match) then move on
+                // because we don't have an indel in the middle
+                if (nm != ncig) {
+                    for (k = ncig-1; k >= 0; k--) {
+                        int cop = bam_cigar_op(cig[k]);
+                        if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                            continue;
+
+                        if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                            cop == BAM_CEQUAL)
+                            rm += bam_cigar_oplen(cig[k]);
+                        else
+                            break;
+                    }
+
+                    if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
                          continue;
  
-                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
-                        cop == BAM_CEQUAL)
-                        rm += bam_cigar_oplen(cig[k]);
-                    else
-                        break;
+                    if (lm >= REALN_DIST && rm >= REALN_DIST &&
+                        has_clip < (0.15+0.05*(nt>20))*nt)
+                        continue;
                  }
-
-                if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
-                    continue;
-
-                if (lm >= REALN_DIST && rm >= REALN_DIST &&
-                    has_clip < (0.15+0.05*(nt>20))*nt)
-                    continue;
              }
  
              if (b->core.l_qseq > 500) {
@@ -851,6 +859,7 @@ static int mpileup(mplp_conf_t *conf)
      for (i=0; i<nsmpl; i++)
          bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
      if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
+    if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname);
  
      conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
                                conf->delta_baseQ);
@@ -960,6 +969,15 @@ static int mpileup(mplp_conf_t *conf)
      bcf_destroy1(conf->bcf_rec);
      if (conf->bcf_fp)
      {
+        if ( conf->write_index )
+        {
+            if ( bcf_idx_save(conf->bcf_fp)<0 )
+            {
+                if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
+                error("Error: cannot write to index %s\n",conf->index_fn);
+            }
+            free(conf->index_fn);
+        }
          if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
          bcf_hdr_destroy(conf->bcf_hdr);
          bcf_call_destroy(conf->bca);
@@ -1229,6 +1247,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
          "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
          "                          'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
          "      --threads INT       Use multithreading with INT worker threads [0]\n"
+        "      --write-index       Automatically index the output files [off]\n"
          "\n"
          "SNP/INDEL genotype likelihoods options:\n"
          "  -X, --config STR        Specify platform specific profiles (see below)\n"
@@ -1377,6 +1396,7 @@ int main_mpileup(int argc, char *argv[])
          {"seed", required_argument, NULL, 13},
          {"ambig-reads", required_argument, NULL, 14},
          {"ar", required_argument, NULL, 14},
+        {"write-index",no_argument,NULL,21},
          {NULL, 0, NULL, 0}
      };
      while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
@@ -1499,6 +1519,7 @@ int main_mpileup(int argc, char *argv[])
              }
              break;
          case  20: mplp.indels_v20 = 1; break;
+        case  21: mplp.write_index = 1; break;
          case 'A': use_orphan = 1; break;
          case 'F': mplp.min_frac = atof(optarg); break;
          case 'm': mplp.min_support = atoi(optarg); break;
diff --git a/bcftools/reheader.c b/bcftools/reheader.c

index 4458f27bce88c34180e709091c1cc528cfcf567c..ed852173ccfdaccbeede730d82676171c978928f 100644 (file)
--- a/bcftools/reheader.c
+++ b/bcftools/reheader.c
@@ -68,7 +68,8 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
      kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0};
      char *chr_name = NULL, *p, *q = line + 9;   // skip ##contig=
      char *end = q;
-    int nopen = 1, chr_len = 0;
+    int nopen = 1;
+    hts_pos_t chr_len = 0;
      while ( *end && *end!='\n' ) end++;
      while ( *q && *q!='\n' && nopen>0 )
      {
@@ -118,7 +119,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
          if ( !strcmp("ID",key.s) )
          {
              if ( khash_str2int_has_key(chr_seen,val.s) ) continue;
-            chr_len = faidx_seq_len(fai, val.s);
+            chr_len = faidx_seq_len64(fai, val.s);
              if ( chr_len==-1 )
              {
                  free(val.s); free(key.s); free(tmp.s);
@@ -136,7 +137,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
          if ( quoted ) kputc('"',&tmp);
      }
      if ( !chr_name ) return end;
-    ksprintf(dst,"##contig=<ID=%s,length=%d%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
+    ksprintf(dst,"##contig=<ID=%s,length=%"PRIhts_pos"%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
      free(key.s); free(val.s); free(tmp.s);
      return q;
  }
@@ -211,7 +212,7 @@ static void update_from_fai(args_t *args)
      for (i=0; i<n; i++)
      {
          if ( khash_str2int_has_key(chr_seen,faidx_iseq(fai,i)) ) continue;
-        ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%d>\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i)));
+        ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%"PRIhts_pos">\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i)));
      }
      kputs(tmp+1,&hdr_txt_new);
  
@@ -699,7 +700,7 @@ int main_reheader(int argc, char *argv[])
      int c;
      args_t *args  = (args_t*) calloc(1,sizeof(args_t));
      args->argc    = argc; args->argv = argv;
-    
+
      static struct option loptions[] =
      {
          {"temp-prefix",1,0,'T'},
diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c

index a0698709dd00b9b2eb2125c13e6ba91c9bc64fc4..44dff8c9c5bdcf4831b2fb2e298c2648dca20bfc 100644 (file)
--- a/bcftools/reheader.c.pysam.c
+++ b/bcftools/reheader.c.pysam.c
@@ -70,7 +70,8 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
      kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0};
      char *chr_name = NULL, *p, *q = line + 9;   // skip ##contig=
      char *end = q;
-    int nopen = 1, chr_len = 0;
+    int nopen = 1;
+    hts_pos_t chr_len = 0;
      while ( *end && *end!='\n' ) end++;
      while ( *q && *q!='\n' && nopen>0 )
      {
@@ -120,7 +121,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
          if ( !strcmp("ID",key.s) )
          {
              if ( khash_str2int_has_key(chr_seen,val.s) ) continue;
-            chr_len = faidx_seq_len(fai, val.s);
+            chr_len = faidx_seq_len64(fai, val.s);
              if ( chr_len==-1 )
              {
                  free(val.s); free(key.s); free(tmp.s);
@@ -138,7 +139,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
          if ( quoted ) kputc('"',&tmp);
      }
      if ( !chr_name ) return end;
-    ksprintf(dst,"##contig=<ID=%s,length=%d%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
+    ksprintf(dst,"##contig=<ID=%s,length=%"PRIhts_pos"%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
      free(key.s); free(val.s); free(tmp.s);
      return q;
  }
@@ -213,7 +214,7 @@ static void update_from_fai(args_t *args)
      for (i=0; i<n; i++)
      {
          if ( khash_str2int_has_key(chr_seen,faidx_iseq(fai,i)) ) continue;
-        ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%d>\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i)));
+        ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%"PRIhts_pos">\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i)));
      }
      kputs(tmp+1,&hdr_txt_new);
  
@@ -701,7 +702,7 @@ int main_reheader(int argc, char *argv[])
      int c;
      args_t *args  = (args_t*) calloc(1,sizeof(args_t));
      args->argc    = argc; args->argv = argv;
-    
+
      static struct option loptions[] =
      {
          {"temp-prefix",1,0,'T'},
diff --git a/bcftools/tsv2vcf.c b/bcftools/tsv2vcf.c

index 596e75a0a76b513fbec88600081cf9adbb78f7b4..22dec30654742bdfd3edb36d32783c0f72353cb8 100644 (file)
--- a/bcftools/tsv2vcf.c
+++ b/bcftools/tsv2vcf.c
@@ -10,10 +10,10 @@
      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      copies of the Software, and to permit persons to whom the Software is
      furnished to do so, subject to the following conditions:
-    
+
      The above copyright notice and this permission notice shall be included in
      all copies or substantial portions of the Software.
-    
+
      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c

index 8c621572893b42af951ca78559744005231df06e..83de6f3ebd4aa03d7ab730f48d9e007177109d82 100644 (file)
--- a/bcftools/tsv2vcf.c.pysam.c
+++ b/bcftools/tsv2vcf.c.pysam.c
@@ -12,10 +12,10 @@
      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      copies of the Software, and to permit persons to whom the Software is
      furnished to do so, subject to the following conditions:
-    
+
      The above copyright notice and this permission notice shall be included in
      all copies or substantial portions of the Software.
-    
+
      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
diff --git a/bcftools/variantkey.h b/bcftools/variantkey.h

index ccd4d8dd0d9d80485b559d3d37c857cdc7507467..a74935fb756491817e5a045ec6a0d9d7178d3aea 100644 (file)
--- a/bcftools/variantkey.h
+++ b/bcftools/variantkey.h
@@ -3,14 +3,15 @@
  // variantkey.h
  //
  // @category   Libraries
-// @author     Nicola Asuni <nicola.asuni@genomicsplc.com>
-// @copyright  2017-2018 GENOMICS plc
-// @license    MIT (see LICENSE)
-// @link       https://github.com/genomicsplc/variantkey
+// @author     Nicola Asuni <info@tecnick.com>
+// @link       https://github.com/tecnickcom/variantkey
+// @license    MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE)
+// @copyright  2017-2018 GENOMICS plc, 2018-2023 Nicola Asuni - Tecnick.com
  //
  // LICENSE
  //
  // Copyright (c) 2017-2018 GENOMICS plc
+// Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
  //
  // Permission is hereby granted, free of charge, to any person obtaining a copy
  // of this software and associated documentation files (the "Software"), to deal
@@ -54,6 +55,7 @@
  #define VKMASK_REFALT   0x000000007FFFFFFF  //!< VariantKey binary mask for REF+ALT   [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ]
  #define VKSHIFT_CHROM   59 //!< CHROM LSB position from the VariantKey LSB
  #define VKSHIFT_POS     31 //!< POS LSB position from the VariantKey LSB
+#define MAXUINT32       0xFFFFFFFF //!< Maximum value for uint32_t
  
  /**
   * VariantKey struct.
@@ -75,16 +77,54 @@ typedef struct vkrange_t
      uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding
  } vkrange_t;
  
-/** @brief Returns chromosome numerical encoding.
+/** @brief Returns the encoding for a numerical chromosome input.
   *
   * @param chrom  Chromosome. An identifier from the reference genome, no white-space permitted.
   * @param size   Length of the chrom string, excluding the terminating null byte.
   *
   * @return CHROM code
   */
+static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size)
+{
+    size_t i;
+    uint8_t v = (chrom[0] - '0'); // chrom[0] is not range-checked here; only chrom[1..size-1] are validated below
+    for (i = 1; i < size; i++)
+    {
+        if ((chrom[i] > '9') || (chrom[i] < '0'))
+        {
+            return 0; // NA: a character that is not a number was found.
+        }
+        v = ((v * 10) + (chrom[i] - '0')); // NOTE(review): v is uint8_t, so the sum is reduced modulo 256 on assignment (values above 255 wrap)
+    }
+    return v;
+}
+
+
+/** @brief Returns a true value (1) if the input chrom has 'chr' prefix (case insensitive).
+ *
+ * @param chrom  Chromosome. An identifier from the reference genome, no white-space permitted.
+ * @param size   Length of the chrom string, excluding the terminating null byte.
+ *
+ * @return True (1) if the chr prefix is present.
+ */
+static inline int has_chrom_chr_prefix(const char *chrom, size_t size)
+{
+    return ((size > 3) // strictly longer than the prefix: a bare "chr" (size == 3) yields 0
+            && ((chrom[0] == 'c') || (chrom[0] == 'C')) // each letter is accepted in either case
+            && ((chrom[1] == 'h') || (chrom[1] == 'H'))
+            && ((chrom[2] == 'r') || (chrom[2] == 'R')));
+}
+
+/** @brief Returns chromosome numerical encoding.
+ *
+ * @param chrom  Chromosome. An identifier from the reference genome, no white-space permitted.
+ * @param size   Length of the chrom string, excluding the terminating null byte.
+ *
+ * @return CHROM code or 0 in case of invalid input.
+ */
  static inline uint8_t encode_chrom(const char *chrom, size_t size)
  {
-    // X > 23 ; Y > 24 ; M > 25
+    // X = 23; Y = 24; M = 25; any other letter is mapped to 0:
      static const uint8_t onecharmap[] =
      {
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -98,12 +138,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size)
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      };
-    // remove "chr" prefix
-    if ((size > 3)
-            && ((chrom[0] == 'c') || (chrom[0] == 'C'))
-            && ((chrom[1] == 'h') || (chrom[1] == 'H'))
-            && ((chrom[2] == 'r') || (chrom[2] == 'R')))
+    if (has_chrom_chr_prefix(chrom, size))
      {
+        // remove "chr" prefix
          chrom += 3;
          size -= 3;
      }
@@ -111,19 +148,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size)
      {
          return 0;
      }
-    if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number
+    if ((chrom[0] <= '9') && (chrom[0] >= '0'))
      {
-        size_t i;
-        uint8_t v = (chrom[0] - '0');
-        for (i = 1; i < size; i++)
-        {
-            if ((chrom[i] > '9') || (chrom[i] < '0'))
-            {
-                return 0; // NA
-            }
-            v = ((v * 10) + (chrom[i] - '0'));
-        }
-        return v;
+        return encode_numeric_chrom(chrom, size);
      }
      if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't'))))
      {
@@ -159,10 +186,10 @@ static inline uint32_t encode_base(const uint8_t c)
  {
      /*
        Encode base:
-      A > 0
-      C > 1
-      G > 2
-      T > 3
+      A = 0
+      C = 1
+      G = 2
+      T = 3
      */
      static const uint32_t map[] =
      {
@@ -205,7 +232,7 @@ static inline uint32_t encode_refalt_rev(const char *ref, size_t sizeref, const
      uint8_t bitpos = 23;
      if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) || (encode_allele(&h, &bitpos, alt, sizealt) < 0))
      {
-        return 0; // error code
+        return MAXUINT32; // error code
      }
      return h;
  }
@@ -318,7 +345,7 @@ static inline uint32_t encode_refalt(const char *ref, size_t sizeref, const char
      if ((sizeref + sizealt) <= 11)
      {
          uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt);
-        if (h != 0)
+        if (h != MAXUINT32)
          {
              return h;
          }
@@ -486,7 +513,9 @@ static inline void decode_variantkey(uint64_t code, variantkey_t *vk)
      vk->refalt = extract_variantkey_refalt(code);
  }
  
-/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT.
+/**
+ * Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT.
+ * The variant should be already normalized (see normalize_variant or use normalized_variantkey).
   *
   * @param chrom      Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted.
   * @param sizechrom  Length of the chrom string, excluding the terminating null byte.
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c

index 495d2b5a3a907160f1bd4af7a66e224680c62f09..b2e39ef7b6de2feb6434a54b80ddcbaac1c8da39 100644 (file)
--- a/bcftools/vcfannotate.c
+++ b/bcftools/vcfannotate.c
@@ -1,6 +1,6 @@
  /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -118,6 +118,8 @@ typedef struct _args_t
      htsFile *out_fh;
      int output_type, n_threads, clevel;
      bcf_sr_regions_t *tgts;
+    char *index_fn;
+    int write_index;
  
      regidx_t *tgt_idx;  // keep everything in memory only with .tab annotation file and -c BEG,END columns
      regitr_t *tgt_itr;
@@ -2863,9 +2865,16 @@ static void init_data(args_t *args)
  
      if ( args->mark_sites )
      {
-        if ( !args->targets_fname ) error("The -a option not given\n");
-        bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
-            args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+        if ( !args->targets_fname )
+        {
+            if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites);
+            fprintf(stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites);
+            bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites marked with `bcftools annotate -m %s`\">",
+                    args->mark_sites,args->mark_sites);
+        }
+        else
+            bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+                args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
      }
  
      if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
@@ -2881,6 +2890,7 @@ static void init_data(args_t *args)
          if ( args->n_threads )
              hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
          if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      }
  }
  
@@ -2943,7 +2953,19 @@ static void destroy_data(args_t *args)
          convert_destroy(args->set_ids);
      if ( args->filter )
          filter_destroy(args->filter);
-    if (args->out_fh) hts_close(args->out_fh);
+    if (args->out_fh)
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+    }
      free(args->sample_map);
      free(args->merge_method_str.s);
  }
@@ -3072,6 +3094,7 @@ static void annotate(args_t *args, bcf1_t *line)
          for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
          if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
          {
+            hts_pos_t vcf_end = line->pos + line->rlen - 1;
              while ( regitr_overlap(args->tgt_itr) )
              {
                  annot_line_t *tmp = &args->alines[0];
@@ -3082,7 +3105,7 @@ static void annotate(args_t *args, bcf1_t *line)
                  // Check min overlap
                  int len_ann = tmp->end - tmp->start + 1;
                  int len_vcf = line->rlen;
-                int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+                int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
                  assert( isec > 0 );
                  if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
                  if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
@@ -3096,9 +3119,9 @@ static void annotate(args_t *args, bcf1_t *line)
                          error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
                      if ( ret==0 )
                          args->cols[j].done = 1;
+                    has_overlap = 1;
                  }
              }
-            has_overlap = 1;
          }
          for (j=0; j<args->ncols; j++)
          {
@@ -3273,6 +3296,8 @@ static void annotate(args_t *args, bcf1_t *line)
  
      if ( args->mark_sites )
      {
+        if ( !args->targets_fname ) has_overlap = 1;
+
          // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
          if ( args->mark_sites_logic==MARK_LISTED )
              bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
@@ -3315,6 +3340,7 @@ static void usage(args_t *args)
      fprintf(stderr, "       --single-overlaps           Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
      fprintf(stderr, "   -x, --remove LIST               List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
      fprintf(stderr, "       --threads INT               Number of extra output compression threads [0]\n");
+    fprintf(stderr, "       --write-index               Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Examples:\n");
      fprintf(stderr, "   http://samtools.github.io/bcftools/howtos/annotate.html\n");
@@ -3371,6 +3397,7 @@ int main_vcfannotate(int argc, char *argv[])
          {"min-overlap",required_argument,NULL,12},
          {"no-version",no_argument,NULL,8},
          {"force",no_argument,NULL,'f'},
+        {"write-index",no_argument,NULL,13},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -3447,6 +3474,7 @@ int main_vcfannotate(int argc, char *argv[])
              case 10 : args->single_overlaps = 1; break;
              case 11 : args->rename_annots = optarg; break;
              case 12 : args->min_overlap_str = optarg; break;
+            case 13 : args->write_index = 1; break;
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
          }
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c

index 54f6a3936f0281382d80708c41c85d20e31af930..2234ddca9ef94e6bf38e13da6a4b0dd1a2b569cd 100644 (file)
--- a/bcftools/vcfannotate.c.pysam.c
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -120,6 +120,8 @@ typedef struct _args_t
      htsFile *out_fh;
      int output_type, n_threads, clevel;
      bcf_sr_regions_t *tgts;
+    char *index_fn;
+    int write_index;
  
      regidx_t *tgt_idx;  // keep everything in memory only with .tab annotation file and -c BEG,END columns
      regitr_t *tgt_itr;
@@ -2865,9 +2867,16 @@ static void init_data(args_t *args)
  
      if ( args->mark_sites )
      {
-        if ( !args->targets_fname ) error("The -a option not given\n");
-        bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
-            args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+        if ( !args->targets_fname )
+        {
+            if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites);
+            fprintf(bcftools_stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites);
+            bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites marked with `bcftools annotate -m %s`\">",
+                    args->mark_sites,args->mark_sites);
+        }
+        else
+            bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+                args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
      }
  
      if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
@@ -2883,6 +2892,7 @@ static void init_data(args_t *args)
          if ( args->n_threads )
              hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
          if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      }
  }
  
@@ -2945,7 +2955,19 @@ static void destroy_data(args_t *args)
          convert_destroy(args->set_ids);
      if ( args->filter )
          filter_destroy(args->filter);
-    if (args->out_fh) hts_close(args->out_fh);
+    if (args->out_fh)
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+    }
      free(args->sample_map);
      free(args->merge_method_str.s);
  }
@@ -3074,6 +3096,7 @@ static void annotate(args_t *args, bcf1_t *line)
          for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
          if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
          {
+            hts_pos_t vcf_end = line->pos + line->rlen - 1;
              while ( regitr_overlap(args->tgt_itr) )
              {
                  annot_line_t *tmp = &args->alines[0];
@@ -3084,7 +3107,7 @@ static void annotate(args_t *args, bcf1_t *line)
                  // Check min overlap
                  int len_ann = tmp->end - tmp->start + 1;
                  int len_vcf = line->rlen;
-                int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+                int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
                  assert( isec > 0 );
                  if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
                  if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
@@ -3098,9 +3121,9 @@ static void annotate(args_t *args, bcf1_t *line)
                          error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
                      if ( ret==0 )
                          args->cols[j].done = 1;
+                    has_overlap = 1;
                  }
              }
-            has_overlap = 1;
          }
          for (j=0; j<args->ncols; j++)
          {
@@ -3275,6 +3298,8 @@ static void annotate(args_t *args, bcf1_t *line)
  
      if ( args->mark_sites )
      {
+        if ( !args->targets_fname ) has_overlap = 1;
+
          // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
          if ( args->mark_sites_logic==MARK_LISTED )
              bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
@@ -3317,6 +3342,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "       --single-overlaps           Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
      fprintf(bcftools_stderr, "   -x, --remove LIST               List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
      fprintf(bcftools_stderr, "       --threads INT               Number of extra output compression threads [0]\n");
+    fprintf(bcftools_stderr, "       --write-index               Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Examples:\n");
      fprintf(bcftools_stderr, "   http://samtools.github.io/bcftools/howtos/annotate.html\n");
@@ -3373,6 +3399,7 @@ int main_vcfannotate(int argc, char *argv[])
          {"min-overlap",required_argument,NULL,12},
          {"no-version",no_argument,NULL,8},
          {"force",no_argument,NULL,'f'},
+        {"write-index",no_argument,NULL,13},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -3449,6 +3476,7 @@ int main_vcfannotate(int argc, char *argv[])
              case 10 : args->single_overlaps = 1; break;
              case 11 : args->rename_annots = optarg; break;
              case 12 : args->min_overlap_str = optarg; break;
+            case 13 : args->write_index = 1; break;
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
          }
diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c

index 1cd6f504cdf4ea055f33de7cd6f159dc859c9114..d2f6e2c5fcf37106bac220938086335c521e6326 100644 (file)
--- a/bcftools/vcfcall.c
+++ b/bcftools/vcfcall.c
@@ -1,6 +1,6 @@
  /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -97,6 +97,8 @@ typedef struct
  
      int argc;
      char **argv;
+    char *index_fn;
+    int write_index;
  
      //  int flag, prior_type, n1, n_sub, *sublist, n_perm;
      //  uint32_t *trio_aux;
@@ -715,6 +717,7 @@ static void init_data(args_t *args)
  
      if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
      if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
  }
@@ -753,6 +756,15 @@ static void destroy_data(args_t *args)
      free(args->str.s);
      if ( args->gvcf ) gvcf_destroy(args->gvcf);
      bcf_hdr_destroy(args->aux.hdr);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
      bcf_sr_destroy(args->aux.srs);
  }
@@ -908,6 +920,7 @@ static void usage(args_t *args)
      fprintf(stderr, "   -M, --keep-masked-ref           Keep sites with masked reference allele (REF=N)\n");
      fprintf(stderr, "   -V, --skip-variants TYPE        Skip indels/snps\n");
      fprintf(stderr, "   -v, --variants-only             Output variant sites only\n");
+    fprintf(stderr, "       --write-index               Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Consensus/variant calling options:\n");
      fprintf(stderr, "   -c, --consensus-caller          The original calling method (conflicts with -m)\n");
@@ -990,6 +1003,7 @@ int main_vcfcall(int argc, char *argv[])
          {"chromosome-X",no_argument,NULL,'X'},
          {"chromosome-Y",no_argument,NULL,'Y'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
  
@@ -1076,6 +1090,7 @@ int main_vcfcall(int argc, char *argv[])
                  args.regions_overlap = parse_overlap_option(optarg);
                  if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
                  break;
+            case  10: args.write_index = 1; break;
              default: usage(&args);
          }
      }
diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c

index 975247ca59708c417941dd93bc37479c6270025e..a955342f47c00a8f20ec677a634d3da17f612073 100644 (file)
--- a/bcftools/vcfcall.c.pysam.c
+++ b/bcftools/vcfcall.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -99,6 +99,8 @@ typedef struct
  
      int argc;
      char **argv;
+    char *index_fn;
+    int write_index;
  
      //  int flag, prior_type, n1, n_sub, *sublist, n_perm;
      //  uint32_t *trio_aux;
@@ -717,6 +719,7 @@ static void init_data(args_t *args)
  
      if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
      if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
  }
@@ -755,6 +758,15 @@ static void destroy_data(args_t *args)
      free(args->str.s);
      if ( args->gvcf ) gvcf_destroy(args->gvcf);
      bcf_hdr_destroy(args->aux.hdr);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
      bcf_sr_destroy(args->aux.srs);
  }
@@ -910,6 +922,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "   -M, --keep-masked-ref           Keep sites with masked reference allele (REF=N)\n");
      fprintf(bcftools_stderr, "   -V, --skip-variants TYPE        Skip indels/snps\n");
      fprintf(bcftools_stderr, "   -v, --variants-only             Output variant sites only\n");
+    fprintf(bcftools_stderr, "       --write-index               Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
      fprintf(bcftools_stderr, "   -c, --consensus-caller          The original calling method (conflicts with -m)\n");
@@ -992,6 +1005,7 @@ int main_vcfcall(int argc, char *argv[])
          {"chromosome-X",no_argument,NULL,'X'},
          {"chromosome-Y",no_argument,NULL,'Y'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
  
@@ -1078,6 +1092,7 @@ int main_vcfcall(int argc, char *argv[])
                  args.regions_overlap = parse_overlap_option(optarg);
                  if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
                  break;
+            case  10: args.write_index = 1; break;
              default: usage(&args);
          }
      }
diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c

index 74fd036b87871b42232d9c42b54fd61a31665a37..8e25cc5902113753b161da3d24845b184f027996 100644 (file)
--- a/bcftools/vcfconcat.c
+++ b/bcftools/vcfconcat.c
@@ -1,6 +1,6 @@
  /*  vcfconcat.c -- Concatenate or combine VCF/BCF files.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -46,6 +46,8 @@ typedef struct _args_t
      int output_type, n_threads, record_cmd_line, clevel;
      bcf_hdr_t *out_hdr;
      int *seen_seq;
+    char *index_fn;
+    int write_index;
  
      // phasing
      int *start_pos, start_tid, ifname;
@@ -59,10 +61,21 @@ typedef struct _args_t
      int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
      int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
      int verbose, explicit_output_type, ligate_force, ligate_warn;
+    int sites_only;
      htsThreadPool *tpool;
  }
  args_t;
  
+static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr)  // Returns hdr itself, or a replacement header without FORMAT lines when args->sites_only is set
+{
+    if ( !args->sites_only ) return hdr;  // genotypes are kept: hand the caller's header back untouched
+    bcf_hdr_t *rmme = hdr;  // remember the input header so it can be destroyed below
+    hdr = bcf_hdr_subset(rmme, 0, 0, 0);  // subset with an empty sample selection; NOTE(review): the result is used below without a NULL check
+    bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);  // drop FORMAT header lines (NULL key presumably means all of them - confirm against htslib)
+    bcf_hdr_destroy(rmme);  // the input header is destroyed here, callers must continue with the returned pointer only
+    return hdr;
+}
+
  static void init_data(args_t *args)
  {
      bcf1_t *line = NULL;
@@ -83,6 +96,8 @@ static void init_data(args_t *args)
      {
          htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
          bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+        hdr = drop_hdr_genotypes(args, hdr);
+
          args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
          if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
              error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
@@ -142,6 +157,7 @@ static void init_data(args_t *args)
          hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
      }
      if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      if ( args->allow_overlaps )
      {
@@ -203,7 +219,16 @@ static void destroy_data(args_t *args)
      int i;
      if ( args->out_fh )
      {
-        if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n",args->output_fname?args->output_fname:"stdout");
      }
      if ( args->tpool && !args->files )
      {
@@ -264,7 +289,7 @@ static void phased_flush(args_t *args)
          bcf1_t *brec = args->buf[i+1];
  
          int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
-        if ( nGTs < 0 ) 
+        if ( nGTs < 0 )
          {
              if ( !gt_absent_warned )
              {
@@ -359,7 +384,7 @@ static void phased_flush(args_t *args)
              bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
              PQ_printed = 1;
              for (j=0; j<nsmpl; j++)
-                if ( args->phase_qual[j] < args->min_PQ ) 
+                if ( args->phase_qual[j] < args->min_PQ )
                  {
                      args->phase_set[j] = rec->pos+1;
                      args->phase_set_changed = 1;
@@ -582,13 +607,14 @@ static void concat(args_t *args)
              {
                  bcf1_t *line = bcf_sr_get_line(args->files,i);
                  if ( !line ) continue;
+                if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
                  bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                  if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
                  if ( args->remove_dups ) break;
              }
          }
      }
-    else    // concatenating
+    else    // concatenate as is
      {
          struct timeval t0, t1;
          kstring_t tmp = {0,0,0};
@@ -604,6 +630,13 @@ static void concat(args_t *args)
              htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]);
              if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool);
              bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]);
+            if ( args->sites_only )
+            {
+                bcf_hdr_t *hdr_ori = hdr;
+                hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0);
+                bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+                bcf_hdr_destroy(hdr_ori);
+            }
              if ( !fp->is_bin && args->output_type&FT_VCF )
              {
                  line->max_unpack = BCF_UN_STR;
@@ -611,6 +644,22 @@ static void concat(args_t *args)
                  while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                  {
                      char *str = fp->line.s;
+
+                    // remove genotypes
+                    if ( args->sites_only )
+                    {
+                        int ntab = 0;
+                        while ( *str )
+                        {
+                            if ( *str == '\t' && ++ntab==8 )
+                            {
+                                *str = 0;
+                                break;
+                            }
+                            str++;
+                        }
+                        str = fp->line.s;
+                    }
                      while ( *str && *str!='\t' ) str++;
                      tmp.l = 0;
                      kputsn(fp->line.s,str-fp->line.s,&tmp);
@@ -639,6 +688,7 @@ static void concat(args_t *args)
                  line->max_unpack = 0;
                  while ( bcf_read(fp, hdr, line)==0 )
                  {
+                    if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
                      bcf_translate(args->out_hdr, hdr, line);
  
                      if ( prev_chr_id!=line->rid )
@@ -917,6 +967,7 @@ static void usage(args_t *args)
      fprintf(stderr, "   -d, --rm-dups STRING           Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
      fprintf(stderr, "   -D, --remove-duplicates        Alias for -d exact\n");
      fprintf(stderr, "   -f, --file-list FILE           Read the list of files from a file.\n");
+    fprintf(stderr, "   -G, --drop-genotypes           Drop individual genotype information.\n");
      fprintf(stderr, "   -l, --ligate                   Ligate phased VCFs by matching phase at overlapping haplotypes\n");
      fprintf(stderr, "       --ligate-force             Ligate even non-overlapping chunks, keep all sites\n");
      fprintf(stderr, "       --ligate-warn              Drop sites in imperfect overlaps\n");
@@ -931,6 +982,7 @@ static void usage(args_t *args)
      fprintf(stderr, "       --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
      fprintf(stderr, "       --threads INT              Use multithreading with <int> worker threads [0]\n");
      fprintf(stderr, "   -v, --verbose 0|1              Set verbosity level [1]\n");
+    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -969,10 +1021,12 @@ int main_vcfconcat(int argc, char *argv[])
          {"file-list",required_argument,NULL,'f'},
          {"min-PQ",required_argument,NULL,'q'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,13},
+        {"drop-genotypes",no_argument,NULL,'G'},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
      {
          switch (c) {
              case 'c': args->compact_PS = 1; break;
@@ -980,7 +1034,7 @@ int main_vcfconcat(int argc, char *argv[])
              case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
              case 'd': args->remove_dups = optarg; break;
              case 'D': args->remove_dups = "exact"; break;
-            case 'q': 
+            case 'q':
                  args->min_PQ = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
                  break;
@@ -988,6 +1042,7 @@ int main_vcfconcat(int argc, char *argv[])
              case 'a': args->allow_overlaps = 1; break;
              case 'l': args->phased_concat = 1; break;
              case 'f': args->file_list = optarg; break;
+            case 'G': args->sites_only = 1; break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
                  args->explicit_output_type = 1;
@@ -1021,6 +1076,7 @@ int main_vcfconcat(int argc, char *argv[])
                        args->verbose = strtol(optarg, &tmp, 0);
                        if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
                        break;
+            case 13 : args->write_index = 1; break;
              case 'h':
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
@@ -1035,6 +1091,7 @@ int main_vcfconcat(int argc, char *argv[])
      }
      if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
      if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
+    if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n");
      if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
      if ( args->file_list )
      {
@@ -1049,6 +1106,7 @@ int main_vcfconcat(int argc, char *argv[])
      {
          if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
          if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+        if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
          naive_concat(args);
          destroy_data(args);
          free(args);
diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c

index e1baeefea3805e94f8102a4f58ba0a98a9bdf065..0d3b3943cc1c8d6cfcba2120ca785616dd213d41 100644 (file)
--- a/bcftools/vcfconcat.c.pysam.c
+++ b/bcftools/vcfconcat.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfconcat.c -- Concatenate or combine VCF/BCF files.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -48,6 +48,8 @@ typedef struct _args_t
      int output_type, n_threads, record_cmd_line, clevel;
      bcf_hdr_t *out_hdr;
      int *seen_seq;
+    char *index_fn;
+    int write_index;
  
      // phasing
      int *start_pos, start_tid, ifname;
@@ -61,10 +63,21 @@ typedef struct _args_t
      int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
      int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
      int verbose, explicit_output_type, ligate_force, ligate_warn;
+    int sites_only;
      htsThreadPool *tpool;
  }
  args_t;
  
+static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr)  // Returns hdr itself, or a replacement header without FORMAT lines when args->sites_only is set
+{
+    if ( !args->sites_only ) return hdr;  // genotypes are kept: hand the caller's header back untouched
+    bcf_hdr_t *rmme = hdr;  // remember the input header so it can be destroyed below
+    hdr = bcf_hdr_subset(rmme, 0, 0, 0);  // subset with an empty sample selection; NOTE(review): the result is used below without a NULL check
+    bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);  // drop FORMAT header lines (NULL key presumably means all of them - confirm against htslib)
+    bcf_hdr_destroy(rmme);  // the input header is destroyed here, callers must continue with the returned pointer only
+    return hdr;
+}
+
  static void init_data(args_t *args)
  {
      bcf1_t *line = NULL;
@@ -85,6 +98,8 @@ static void init_data(args_t *args)
      {
          htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
          bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+        hdr = drop_hdr_genotypes(args, hdr);
+
          args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
          if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
              error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
@@ -144,6 +159,7 @@ static void init_data(args_t *args)
          hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
      }
      if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      if ( args->allow_overlaps )
      {
@@ -205,7 +221,16 @@ static void destroy_data(args_t *args)
      int i;
      if ( args->out_fh )
      {
-        if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n",args->output_fname?args->output_fname:"bcftools_stdout");
      }
      if ( args->tpool && !args->files )
      {
@@ -266,7 +291,7 @@ static void phased_flush(args_t *args)
          bcf1_t *brec = args->buf[i+1];
  
          int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
-        if ( nGTs < 0 ) 
+        if ( nGTs < 0 )
          {
              if ( !gt_absent_warned )
              {
@@ -361,7 +386,7 @@ static void phased_flush(args_t *args)
              bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
              PQ_printed = 1;
              for (j=0; j<nsmpl; j++)
-                if ( args->phase_qual[j] < args->min_PQ ) 
+                if ( args->phase_qual[j] < args->min_PQ )
                  {
                      args->phase_set[j] = rec->pos+1;
                      args->phase_set_changed = 1;
@@ -584,13 +609,14 @@ static void concat(args_t *args)
              {
                  bcf1_t *line = bcf_sr_get_line(args->files,i);
                  if ( !line ) continue;
+                if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
                  bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                  if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
                  if ( args->remove_dups ) break;
              }
          }
      }
-    else    // concatenating
+    else    // concatenate as is
      {
          struct timeval t0, t1;
          kstring_t tmp = {0,0,0};
@@ -606,6 +632,13 @@ static void concat(args_t *args)
              htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]);
              if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool);
              bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]);
+            if ( args->sites_only )
+            {
+                bcf_hdr_t *hdr_ori = hdr;
+                hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0);
+                bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+                bcf_hdr_destroy(hdr_ori);
+            }
              if ( !fp->is_bin && args->output_type&FT_VCF )
              {
                  line->max_unpack = BCF_UN_STR;
@@ -613,6 +646,22 @@ static void concat(args_t *args)
                  while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                  {
                      char *str = fp->line.s;
+
+                    // remove genotypes
+                    if ( args->sites_only )
+                    {
+                        int ntab = 0;
+                        while ( *str )
+                        {
+                            if ( *str == '\t' && ++ntab==8 )
+                            {
+                                *str = 0;
+                                break;
+                            }
+                            str++;
+                        }
+                        str = fp->line.s;
+                    }
                      while ( *str && *str!='\t' ) str++;
                      tmp.l = 0;
                      kputsn(fp->line.s,str-fp->line.s,&tmp);
@@ -641,6 +690,7 @@ static void concat(args_t *args)
                  line->max_unpack = 0;
                  while ( bcf_read(fp, hdr, line)==0 )
                  {
+                    if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
                      bcf_translate(args->out_hdr, hdr, line);
  
                      if ( prev_chr_id!=line->rid )
@@ -919,6 +969,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "   -d, --rm-dups STRING           Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
      fprintf(bcftools_stderr, "   -D, --remove-duplicates        Alias for -d exact\n");
      fprintf(bcftools_stderr, "   -f, --file-list FILE           Read the list of files from a file.\n");
+    fprintf(bcftools_stderr, "   -G, --drop-genotypes           Drop individual genotype information.\n");
      fprintf(bcftools_stderr, "   -l, --ligate                   Ligate phased VCFs by matching phase at overlapping haplotypes\n");
      fprintf(bcftools_stderr, "       --ligate-force             Ligate even non-overlapping chunks, keep all sites\n");
      fprintf(bcftools_stderr, "       --ligate-warn              Drop sites in imperfect overlaps\n");
@@ -933,6 +984,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "       --regions-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
      fprintf(bcftools_stderr, "       --threads INT              Use multithreading with <int> worker threads [0]\n");
      fprintf(bcftools_stderr, "   -v, --verbose 0|1              Set verbosity level [1]\n");
+    fprintf(bcftools_stderr, "       --write-index              Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      bcftools_exit(1);
  }
@@ -971,10 +1023,12 @@ int main_vcfconcat(int argc, char *argv[])
          {"file-list",required_argument,NULL,'f'},
          {"min-PQ",required_argument,NULL,'q'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,13},
+        {"drop-genotypes",no_argument,NULL,'G'},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
      {
          switch (c) {
              case 'c': args->compact_PS = 1; break;
@@ -982,7 +1036,7 @@ int main_vcfconcat(int argc, char *argv[])
              case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
              case 'd': args->remove_dups = optarg; break;
              case 'D': args->remove_dups = "exact"; break;
-            case 'q': 
+            case 'q':
                  args->min_PQ = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
                  break;
@@ -990,6 +1044,7 @@ int main_vcfconcat(int argc, char *argv[])
              case 'a': args->allow_overlaps = 1; break;
              case 'l': args->phased_concat = 1; break;
              case 'f': args->file_list = optarg; break;
+            case 'G': args->sites_only = 1; break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
                  args->explicit_output_type = 1;
@@ -1023,6 +1078,7 @@ int main_vcfconcat(int argc, char *argv[])
                        args->verbose = strtol(optarg, &tmp, 0);
                        if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
                        break;
+            case 13 : args->write_index = 1; break;
              case 'h':
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
@@ -1037,6 +1093,7 @@ int main_vcfconcat(int argc, char *argv[])
      }
      if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
      if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
+    if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n");
      if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
      if ( args->file_list )
      {
@@ -1051,6 +1108,7 @@ int main_vcfconcat(int argc, char *argv[])
      {
          if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
          if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+        if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
          naive_concat(args);
          destroy_data(args);
          free(args);
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c

index ce5ed99810b939343148aeb6c8b18d604cad8c39..76c4a325adba2143f0b59f0b0d123e4b2000c4e3 100644 (file)
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -1,6 +1,6 @@
  /*  vcfconvert.c -- convert between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -59,7 +59,7 @@ struct _args_t
      bcf_hdr_t *header;
      void (*convert_func)(struct _args_t *);
      struct {
-        int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; 
+        int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written;
      } n;
      kstring_t str;
      int32_t *gts;
@@ -70,6 +70,11 @@ struct _args_t
      char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
      char *outfname, *infname, *ref_fname, *sex_fname;
      int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
+    char *index_fn;
+    int write_index;
+    struct {
+        kstring_t ref,alt,refalt;
+    } tsv;
  };
  
  static void destroy_data(args_t *args)
@@ -139,6 +144,36 @@ static void open_vcf(args_t *args, const char *format_str)
      free(samples);
  }
  
+static int _set_ref_alt(args_t *args, bcf1_t *rec)  // Builds the allele string from the buffered REF and ALT, stores it in rec and resets the buffers; always returns 0
+{
+    args->tsv.refalt.l = 0;  // start from an empty "REF[,ALT]" string
+    kputs(args->tsv.ref.s, &args->tsv.refalt);
+    if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) )  // append ALT only when it is neither "." nor identical to REF
+    {
+        kputc(',', &args->tsv.refalt);
+        kputs(args->tsv.alt.s, &args->tsv.refalt);
+    }
+    bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s);  // NOTE(review): the return value is not checked
+    args->tsv.ref.l = 0;  // empty all three buffers so the next record starts clean
+    args->tsv.alt.l = 0;
+    args->tsv.refalt.l = 0;
+    return 0;
+}
+static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr)  // Column setter: buffers the REF field; usr is cast to args_t*
+{
+    args_t *args = (args_t*) usr;
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref);  // append the current field, the bytes from tsv->ss up to tsv->se, to the REF buffer
+    if ( args->tsv.alt.l ) return _set_ref_alt(args,rec);  // ALT buffer is already non-empty: both alleles are available, write them to rec
+    return 0;  // ALT buffer still empty: nothing is written to rec here
+}
+static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr)  // Column setter: buffers the ALT field; usr is cast to args_t*
+{
+    args_t *args = (args_t*) usr;
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt);  // append the current field, the bytes from tsv->ss up to tsv->se, to the ALT buffer
+    if ( args->tsv.ref.l ) return _set_ref_alt(args,rec);  // REF buffer is already non-empty: both alleles are available, write them to rec
+    return 0;  // REF buffer still empty: nothing is written to rec here
+}
+
  // Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error
  static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
  {
@@ -160,7 +195,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
      // REF,ALT
      args->str.l = 0;
      se = ++ss;
-    while ( se < tsv->se && *se!='_' ) se++; 
+    while ( se < tsv->se && *se!='_' ) se++;
      if ( *se!='_' ) return -1;
      kputsn(ss,se-ss,&args->str);
      ss = ++se;
@@ -269,12 +304,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
          if ( aa >= ab )
          {
              if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
-            else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); 
+            else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
          }
-        else if ( ab >= bb ) 
+        else if ( ab >= bb )
          {
              args->gts[2*i+0] = bcf_gt_unphased(0);
-            args->gts[2*i+1] = bcf_gt_unphased(1); 
+            args->gts[2*i+1] = bcf_gt_unphased(1);
          }
          else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
      }
@@ -293,7 +328,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
      else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
  
      // up is short for "unphased"
-    int nup = 0; 
+    int nup = 0;
      for (i=0; i<nsamples; i++)
      {
          char *ss = tsv->ss + 4*i + nup;
@@ -324,11 +359,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
                  break;
              default :
                  fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
-                return -1; 
+                return -1;
              }
              if( ss[all*2+up+1]=='*' ) up = up + 1;
          }
-        
+
          if(up && up != 2)
          {
              fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
@@ -356,13 +391,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
  static void gensample_to_vcf(args_t *args)
  {
      /*
-     *  Inpute: IMPUTE2 output (indentation changed here for clarity): 
+     *  Inpute: IMPUTE2 output (indentation changed here for clarity):
       *
       *      20:62116619_C_T 20:62116619     62116619 C T 0.969 0.031 0 ...
       *      ---             20:62116698_C_A 62116698 C A 1     0     0 ...
       *
       *  Second column is expected in the form of CHROM:POS_REF_ALT. We use second
-     *  column because the first can be empty ("--") when filling sites from reference 
+     *  column because the first can be empty ("--") when filling sites from reference
       *  panel. When the option --vcf-ids is given, the first column is used to set the
       *  VCF ID.
       *
@@ -455,6 +490,7 @@ static void gensample_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
      bcf1_t *rec = bcf_init();
  
      nsamples -= 2;
@@ -474,6 +510,15 @@ static void gensample_to_vcf(args_t *args)
      }
      while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
      if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
      bcf_hdr_destroy(args->header);
@@ -589,6 +634,7 @@ static void haplegendsample_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
      bcf1_t *rec = bcf_init();
  
      args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
@@ -616,6 +662,15 @@ static void haplegendsample_to_vcf(args_t *args)
          }
      }
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
      if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
      if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
@@ -731,6 +786,7 @@ static void hapsample_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
      bcf1_t *rec = bcf_init();
  
      nsamples -= 2;
@@ -749,6 +805,15 @@ static void hapsample_to_vcf(args_t *args)
      }
      while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
      if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
      bcf_hdr_destroy(args->header);
@@ -784,7 +849,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
      }
      for (i=0; i<nlines; i++) free(lines[i]);
      free(lines);
-    for (i=0; i<bcf_hdr_nsamples(hdr); i++) 
+    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
          if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
      return sample2sex;
  }
@@ -847,7 +912,7 @@ static void vcf_to_gensample(args_t *args)
      if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
  
      // write samples file
-    if (sample_fname) 
+    if (sample_fname)
      {
          char *sample2sex = NULL;
          if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
@@ -915,7 +980,7 @@ static void vcf_to_gensample(args_t *args)
              nok++;
          }
      }
-    fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", 
+    fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
          nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
  
      if ( str.m ) free(str.s);
@@ -976,7 +1041,7 @@ static void vcf_to_haplegendsample(args_t *args)
      {
          char *sample2sex = NULL;
          if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
-        
+
          int i;
          BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
          str.l = 0;
@@ -1078,7 +1143,7 @@ static void vcf_to_hapsample(args_t *args)
          kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
      else
          kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
-    
+
      if ( args->hap2dip )
          kputs("%_GT_TO_HAP2\n", &str);
      else
@@ -1213,7 +1278,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
  {
      if ( se - ss > 2 ) return -1;   // currently only SNPs
  
-    if ( ss[0]=='-' )
+    if ( ss[0]=='-' || ss[0]=='.' )
      {
          // missing GT
          gts[0] = bcf_gt_missing;
@@ -1229,7 +1294,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
      if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
      if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
  
-    gts[0] = bcf_gt_unphased(alleles[a0]); 
+    gts[0] = bcf_gt_unphased(alleles[a0]);
      gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
  
      if ( ref==a0 && ref==a1  ) args->n.hom_rr++;    // hom ref: RR
@@ -1265,7 +1330,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
          }
          ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
          if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
-        if ( ret==-2 ) 
+        if ( ret==-2 )
          {
              // something else than a SNP
              free(ref);
@@ -1275,7 +1340,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
  
      args->str.l = 0;
      kputc(ref[0], &args->str);
-    for (i=0; i<5; i++) 
+    for (i=0; i<5; i++)
      {
          if ( alleles[i]>0 )
          {
@@ -1293,7 +1358,6 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
  static void tsv_to_vcf(args_t *args)
  {
      if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
-    if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
  
      args->ref = fai_load(args->ref_fname);
      if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
@@ -1303,17 +1367,21 @@ static void tsv_to_vcf(args_t *args)
      bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
      if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
  
-    int i, n;
-    char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
-    if ( !smpls ) error("Could not parse %s\n", args->sample_list);
-    for (i=0; i<n; i++)
+    int i, nsmpl;
+    char **smpl;
+    if ( args->sample_list )
      {
-        bcf_hdr_add_sample(args->header, smpls[i]);
-        free(smpls[i]);
+        smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+        if ( !smpl ) error("Could not parse %s\n", args->sample_list);
+        for (i=0; i<nsmpl; i++)
+        {
+            bcf_hdr_add_sample(args->header, smpl[i]);
+            free(smpl[i]);
+        }
+        free(smpl);
+        bcf_hdr_add_sample(args->header, NULL);
+        args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2);
      }
-    free(smpls);
-    bcf_hdr_add_sample(args->header, NULL);
-    args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
  
      char wmode[8];
      set_wmode(wmode,args->output_type,args->outfname,args->clevel);
@@ -1321,12 +1389,18 @@ static void tsv_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
  
      tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
      if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
      if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
      if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
-    if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+    if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 )
+    {
+        if ( args->sample_list ) error("Expected AA column with -s/-S\n");
+        if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 )
+            error("Expected REF and ALT columns when AA was not given\n");
+    }
  
      bcf1_t *rec = bcf_init();
      bcf_float_set_missing(rec->qual);
@@ -1343,6 +1417,7 @@ static void tsv_to_vcf(args_t *args)
          if ( !tsv_parse(tsv, rec, line.s) )
          {
              if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+            args->n.written++;
          }
          else
              args->n.skipped++;
@@ -1350,20 +1425,36 @@ static void tsv_to_vcf(args_t *args)
      if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
      free(line.s);
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      bcf_hdr_destroy(args->header);
      if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
      tsv_destroy(tsv);
      bcf_destroy(rec);
      free(args->str.s);
      free(args->gts);
+    free(args->tsv.ref.s);
+    free(args->tsv.alt.s);
+    free(args->tsv.refalt.s);
  
      fprintf(stderr,"Rows total: \t%d\n", args->n.total);
      fprintf(stderr,"Rows skipped: \t%d\n", args->n.skipped);
-    fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
-    fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
-    fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
-    fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
-    fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+    fprintf(stderr,"Sites written: \t%d\n", args->n.written);
+    if ( args->sample_list )
+    {
+        fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
+        fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+        fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
+        fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+        fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+    }
  }
  
  static void vcf_to_vcf(args_t *args)
@@ -1377,6 +1468,7 @@ static void vcf_to_vcf(args_t *args)
  
      bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
      if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
  
      while ( bcf_sr_next_line(args->files) )
      {
@@ -1389,6 +1481,15 @@ static void vcf_to_vcf(args_t *args)
          }
          if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
      }
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
  }
  
@@ -1409,6 +1510,7 @@ static void gvcf_to_vcf(args_t *args)
      bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
      if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
      if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
  
      int32_t *itmp = NULL, nitmp = 0;
  
@@ -1419,7 +1521,7 @@ static void gvcf_to_vcf(args_t *args)
          {
              int pass = filter_test(args->filter, line, NULL);
              if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) 
+            if ( !pass )
              {
                  if ( bcf_write(out_fh,hdr,line)!=0  ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
                  continue;
@@ -1469,6 +1571,15 @@ static void gvcf_to_vcf(args_t *args)
          }
      }
      free(itmp);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
  }
  
@@ -1497,6 +1608,7 @@ static void usage(void)
      fprintf(stderr, "   -o, --output FILE              Output file name [stdout]\n");
      fprintf(stderr, "   -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
      fprintf(stderr, "       --threads INT              Use multithreading with INT worker threads [0]\n");
+    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
      fprintf(stderr, "   -G, --gensample2vcf ...        <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
@@ -1528,7 +1640,7 @@ static void usage(void)
      fprintf(stderr, "\n");
      fprintf(stderr, "TSV conversion:\n");
      fprintf(stderr, "       --tsv2vcf FILE\n");
-    fprintf(stderr, "   -c, --columns STRING           Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+    fprintf(stderr, "   -c, --columns STRING           Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n");
      fprintf(stderr, "   -f, --fasta-ref FILE           Reference sequence in fasta format\n");
      fprintf(stderr, "   -s, --samples LIST             List of sample names\n");
      fprintf(stderr, "   -S, --samples-file FILE        File of sample names\n");
@@ -1590,6 +1702,7 @@ int main_vcfconvert(int argc, char *argv[])
          {"fasta-ref",required_argument,NULL,'f'},
          {"no-version",no_argument,NULL,10},
          {"keep-duplicates",no_argument,NULL,12},
+        {"write-index",no_argument,NULL,16},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -1618,6 +1731,7 @@ int main_vcfconvert(int argc, char *argv[])
              case  7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
              case  8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
              case 15 : args->gen_3N6 = 1; break;
+            case 16 : args->write_index = 1; break;
              case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
              case 'f': args->ref_fname = optarg; break;
              case 'c': args->columns = optarg; break;
@@ -1667,7 +1781,7 @@ int main_vcfconvert(int argc, char *argv[])
          else args->infname = argv[optind];
      }
      if ( !args->infname ) usage();
-    
+
      if ( args->convert_func ) args->convert_func(args);
      else vcf_to_vcf(args);
  
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c

index f340171b9680c8b9180a3eb79340bdd3c0b1e0e6..16bb3be685acbdfc15c4dc49f6d4b8f7ef294153 100644 (file)
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfconvert.c -- convert between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -61,7 +61,7 @@ struct _args_t
      bcf_hdr_t *header;
      void (*convert_func)(struct _args_t *);
      struct {
-        int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; 
+        int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written;
      } n;
      kstring_t str;
      int32_t *gts;
@@ -72,6 +72,11 @@ struct _args_t
      char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
      char *outfname, *infname, *ref_fname, *sex_fname;
      int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
+    char *index_fn;
+    int write_index;
+    struct {
+        kstring_t ref,alt,refalt;
+    } tsv;
  };
  
  static void destroy_data(args_t *args)
@@ -141,6 +146,36 @@ static void open_vcf(args_t *args, const char *format_str)
      free(samples);
  }
  
+static int _set_ref_alt(args_t *args, bcf1_t *rec)
+{
+    args->tsv.refalt.l = 0;
+    kputs(args->tsv.ref.s, &args->tsv.refalt);
+    if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) )
+    {
+        kputc(',', &args->tsv.refalt);
+        kputs(args->tsv.alt.s, &args->tsv.refalt);
+    }
+    bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s);
+    args->tsv.ref.l = 0;
+    args->tsv.alt.l = 0;
+    args->tsv.refalt.l = 0;
+    return 0;
+}
+static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+    args_t *args = (args_t*) usr;
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref);
+    if ( args->tsv.alt.l ) return _set_ref_alt(args,rec);
+    return 0;
+}
+static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+    args_t *args = (args_t*) usr;
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt);
+    if ( args->tsv.ref.l ) return _set_ref_alt(args,rec);
+    return 0;
+}
+
  // Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error
  static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
  {
@@ -162,7 +197,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
      // REF,ALT
      args->str.l = 0;
      se = ++ss;
-    while ( se < tsv->se && *se!='_' ) se++; 
+    while ( se < tsv->se && *se!='_' ) se++;
      if ( *se!='_' ) return -1;
      kputsn(ss,se-ss,&args->str);
      ss = ++se;
@@ -271,12 +306,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
          if ( aa >= ab )
          {
              if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
-            else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); 
+            else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
          }
-        else if ( ab >= bb ) 
+        else if ( ab >= bb )
          {
              args->gts[2*i+0] = bcf_gt_unphased(0);
-            args->gts[2*i+1] = bcf_gt_unphased(1); 
+            args->gts[2*i+1] = bcf_gt_unphased(1);
          }
          else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
      }
@@ -295,7 +330,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
      else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
  
      // up is short for "unphased"
-    int nup = 0; 
+    int nup = 0;
      for (i=0; i<nsamples; i++)
      {
          char *ss = tsv->ss + 4*i + nup;
@@ -326,11 +361,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
                  break;
              default :
                  fprintf(bcftools_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
-                return -1; 
+                return -1;
              }
              if( ss[all*2+up+1]=='*' ) up = up + 1;
          }
-        
+
          if(up && up != 2)
          {
              fprintf(bcftools_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
@@ -358,13 +393,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
  static void gensample_to_vcf(args_t *args)
  {
      /*
-     *  Inpute: IMPUTE2 output (indentation changed here for clarity): 
+     *  Inpute: IMPUTE2 output (indentation changed here for clarity):
       *
       *      20:62116619_C_T 20:62116619     62116619 C T 0.969 0.031 0 ...
       *      ---             20:62116698_C_A 62116698 C A 1     0     0 ...
       *
       *  Second column is expected in the form of CHROM:POS_REF_ALT. We use second
-     *  column because the first can be empty ("--") when filling sites from reference 
+     *  column because the first can be empty ("--") when filling sites from reference
       *  panel. When the option --vcf-ids is given, the first column is used to set the
       *  VCF ID.
       *
@@ -457,6 +492,7 @@ static void gensample_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
      bcf1_t *rec = bcf_init();
  
      nsamples -= 2;
@@ -476,6 +512,15 @@ static void gensample_to_vcf(args_t *args)
      }
      while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
      if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
      bcf_hdr_destroy(args->header);
@@ -591,6 +636,7 @@ static void haplegendsample_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
      bcf1_t *rec = bcf_init();
  
      args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
@@ -618,6 +664,15 @@ static void haplegendsample_to_vcf(args_t *args)
          }
      }
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
      if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
      if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
@@ -733,6 +788,7 @@ static void hapsample_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
      bcf1_t *rec = bcf_init();
  
      nsamples -= 2;
@@ -751,6 +807,15 @@ static void hapsample_to_vcf(args_t *args)
      }
      while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
      if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
      bcf_hdr_destroy(args->header);
@@ -786,7 +851,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
      }
      for (i=0; i<nlines; i++) free(lines[i]);
      free(lines);
-    for (i=0; i<bcf_hdr_nsamples(hdr); i++) 
+    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
          if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
      return sample2sex;
  }
@@ -849,7 +914,7 @@ static void vcf_to_gensample(args_t *args)
      if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
  
      // write samples file
-    if (sample_fname) 
+    if (sample_fname)
      {
          char *sample2sex = NULL;
          if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
@@ -917,7 +982,7 @@ static void vcf_to_gensample(args_t *args)
              nok++;
          }
      }
-    fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", 
+    fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
          nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
  
      if ( str.m ) free(str.s);
@@ -978,7 +1043,7 @@ static void vcf_to_haplegendsample(args_t *args)
      {
          char *sample2sex = NULL;
          if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
-        
+
          int i;
          BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
          str.l = 0;
@@ -1080,7 +1145,7 @@ static void vcf_to_hapsample(args_t *args)
          kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
      else
          kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
-    
+
      if ( args->hap2dip )
          kputs("%_GT_TO_HAP2\n", &str);
      else
@@ -1215,7 +1280,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
  {
      if ( se - ss > 2 ) return -1;   // currently only SNPs
  
-    if ( ss[0]=='-' )
+    if ( ss[0]=='-' || ss[0]=='.' )
      {
          // missing GT
          gts[0] = bcf_gt_missing;
@@ -1231,7 +1296,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
      if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
      if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
  
-    gts[0] = bcf_gt_unphased(alleles[a0]); 
+    gts[0] = bcf_gt_unphased(alleles[a0]);
      gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
  
      if ( ref==a0 && ref==a1  ) args->n.hom_rr++;    // hom ref: RR
@@ -1267,7 +1332,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
          }
          ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
          if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
-        if ( ret==-2 ) 
+        if ( ret==-2 )
          {
              // something else than a SNP
              free(ref);
@@ -1277,7 +1342,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
  
      args->str.l = 0;
      kputc(ref[0], &args->str);
-    for (i=0; i<5; i++) 
+    for (i=0; i<5; i++)
      {
          if ( alleles[i]>0 )
          {
@@ -1295,7 +1360,6 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
  static void tsv_to_vcf(args_t *args)
  {
      if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
-    if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
  
      args->ref = fai_load(args->ref_fname);
      if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
@@ -1305,17 +1369,21 @@ static void tsv_to_vcf(args_t *args)
      bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
      if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
  
-    int i, n;
-    char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
-    if ( !smpls ) error("Could not parse %s\n", args->sample_list);
-    for (i=0; i<n; i++)
+    int i, nsmpl;
+    char **smpl;
+    if ( args->sample_list )
      {
-        bcf_hdr_add_sample(args->header, smpls[i]);
-        free(smpls[i]);
+        smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+        if ( !smpl ) error("Could not parse %s\n", args->sample_list);
+        for (i=0; i<nsmpl; i++)
+        {
+            bcf_hdr_add_sample(args->header, smpl[i]);
+            free(smpl[i]);
+        }
+        free(smpl);
+        bcf_hdr_add_sample(args->header, NULL);
+        args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2);
      }
-    free(smpls);
-    bcf_hdr_add_sample(args->header, NULL);
-    args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
  
      char wmode[8];
      set_wmode(wmode,args->output_type,args->outfname,args->clevel);
@@ -1323,12 +1391,18 @@ static void tsv_to_vcf(args_t *args)
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
  
      tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
      if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
      if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
      if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
-    if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+    if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 )
+    {
+        if ( args->sample_list ) error("Expected AA column with -s/-S\n");
+        if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 )
+            error("Expected REF and ALT columns when AA was not given\n");
+    }
  
      bcf1_t *rec = bcf_init();
      bcf_float_set_missing(rec->qual);
@@ -1345,6 +1419,7 @@ static void tsv_to_vcf(args_t *args)
          if ( !tsv_parse(tsv, rec, line.s) )
          {
              if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+            args->n.written++;
          }
          else
              args->n.skipped++;
@@ -1352,20 +1427,36 @@ static void tsv_to_vcf(args_t *args)
      if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
      free(line.s);
  
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      bcf_hdr_destroy(args->header);
      if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
      tsv_destroy(tsv);
      bcf_destroy(rec);
      free(args->str.s);
      free(args->gts);
+    free(args->tsv.ref.s);
+    free(args->tsv.alt.s);
+    free(args->tsv.refalt.s);
  
      fprintf(bcftools_stderr,"Rows total: \t%d\n", args->n.total);
      fprintf(bcftools_stderr,"Rows skipped: \t%d\n", args->n.skipped);
-    fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing);
-    fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
-    fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra);
-    fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
-    fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa);
+    fprintf(bcftools_stderr,"Sites written: \t%d\n", args->n.written);
+    if ( args->sample_list )
+    {
+        fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing);
+        fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+        fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra);
+        fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+        fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa);
+    }
  }
  
  static void vcf_to_vcf(args_t *args)
@@ -1379,6 +1470,7 @@ static void vcf_to_vcf(args_t *args)
  
      bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
      if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
  
      while ( bcf_sr_next_line(args->files) )
      {
@@ -1391,6 +1483,15 @@ static void vcf_to_vcf(args_t *args)
          }
          if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
      }
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
  }
  
@@ -1411,6 +1512,7 @@ static void gvcf_to_vcf(args_t *args)
      bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
      if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
      if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+    if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
  
      int32_t *itmp = NULL, nitmp = 0;
  
@@ -1421,7 +1523,7 @@ static void gvcf_to_vcf(args_t *args)
          {
              int pass = filter_test(args->filter, line, NULL);
              if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) 
+            if ( !pass )
              {
                  if ( bcf_write(out_fh,hdr,line)!=0  ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
                  continue;
@@ -1471,6 +1573,15 @@ static void gvcf_to_vcf(args_t *args)
          }
      }
      free(itmp);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out_fh)<0 )
+        {
+            if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
  }
  
@@ -1499,6 +1610,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "   -o, --output FILE              Output file name [bcftools_stdout]\n");
      fprintf(bcftools_stderr, "   -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
      fprintf(bcftools_stderr, "       --threads INT              Use multithreading with INT worker threads [0]\n");
+    fprintf(bcftools_stderr, "       --write-index              Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
      fprintf(bcftools_stderr, "   -G, --gensample2vcf ...        <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
@@ -1530,7 +1642,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "TSV conversion:\n");
      fprintf(bcftools_stderr, "       --tsv2vcf FILE\n");
-    fprintf(bcftools_stderr, "   -c, --columns STRING           Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+    fprintf(bcftools_stderr, "   -c, --columns STRING           Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n");
      fprintf(bcftools_stderr, "   -f, --fasta-ref FILE           Reference sequence in fasta format\n");
      fprintf(bcftools_stderr, "   -s, --samples LIST             List of sample names\n");
      fprintf(bcftools_stderr, "   -S, --samples-file FILE        File of sample names\n");
@@ -1592,6 +1704,7 @@ int main_vcfconvert(int argc, char *argv[])
          {"fasta-ref",required_argument,NULL,'f'},
          {"no-version",no_argument,NULL,10},
          {"keep-duplicates",no_argument,NULL,12},
+        {"write-index",no_argument,NULL,16},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -1620,6 +1733,7 @@ int main_vcfconvert(int argc, char *argv[])
              case  7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
              case  8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
              case 15 : args->gen_3N6 = 1; break;
+            case 16 : args->write_index = 1; break;
              case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
              case 'f': args->ref_fname = optarg; break;
              case 'c': args->columns = optarg; break;
@@ -1669,7 +1783,7 @@ int main_vcfconvert(int argc, char *argv[])
          else args->infname = argv[optind];
      }
      if ( !args->infname ) usage();
-    
+
      if ( args->convert_func ) args->convert_func(args);
      else vcf_to_vcf(args);
  
diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c

index 68d8672477202fc5834133fe2b0b541a92e634c5..8665409d158dde9a394bd48f58cde79038a7b220 100644 (file)
--- a/bcftools/vcffilter.c
+++ b/bcftools/vcffilter.c
@@ -1,6 +1,6 @@
  /*  vcffilter.c -- Apply fixed-threshold filters.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -77,6 +77,8 @@ typedef struct _args_t
      char **argv, *output_fname, *targets_list, *regions_list, *mask_list;
      int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate;
      regidx_t *mask;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -491,6 +493,7 @@ static void usage(args_t *args)
      fprintf(stderr, "    -T, --targets-file FILE        Similar to -R but streams rather than index-jumps\n");
      fprintf(stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
      fprintf(stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -533,13 +536,14 @@ int main_vcffilter(int argc, char *argv[])
          {"SnpGap",required_argument,NULL,'g'},
          {"IndelGap",required_argument,NULL,'G'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,12},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
          switch (c) {
              case 'g':
-                args->snp_gap = strtol(optarg,&tmp,10); 
+                args->snp_gap = strtol(optarg,&tmp,10);
                  if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
                  if ( *tmp==':' )
                  {
@@ -625,6 +629,7 @@ int main_vcffilter(int argc, char *argv[])
                  else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
                  else error("Could not parse: --mask-overlap %s\n",optarg);
                  break;
+            case  12 : args->write_index = 1; break;
              case 'h':
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
@@ -672,6 +677,7 @@ int main_vcffilter(int argc, char *argv[])
  
      init_data(args);
      if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      while ( bcf_sr_next_line(args->files) )
      {
          bcf1_t *line = bcf_sr_get_line(args->files, 0);
@@ -713,7 +719,15 @@ int main_vcffilter(int argc, char *argv[])
          }
      }
      buffered_filters(args, NULL);
-
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
      destroy_data(args);
      bcf_sr_destroy(args->files);
diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c

index f99808344d117bfe9fbc0f9d0961f397c79cf31a..6d17151e9ce8416a571f35d886bd445edea5cbcf 100644 (file)
--- a/bcftools/vcffilter.c.pysam.c
+++ b/bcftools/vcffilter.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcffilter.c -- Apply fixed-threshold filters.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -79,6 +79,8 @@ typedef struct _args_t
      char **argv, *output_fname, *targets_list, *regions_list, *mask_list;
      int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate;
      regidx_t *mask;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -493,6 +495,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -T, --targets-file FILE        Similar to -R but streams rather than index-jumps\n");
      fprintf(bcftools_stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
      fprintf(bcftools_stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
+    fprintf(bcftools_stderr, "        --write-index              Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      bcftools_exit(1);
  }
@@ -535,13 +538,14 @@ int main_vcffilter(int argc, char *argv[])
          {"SnpGap",required_argument,NULL,'g'},
          {"IndelGap",required_argument,NULL,'G'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,12},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
          switch (c) {
              case 'g':
-                args->snp_gap = strtol(optarg,&tmp,10); 
+                args->snp_gap = strtol(optarg,&tmp,10);
                  if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
                  if ( *tmp==':' )
                  {
@@ -627,6 +631,7 @@ int main_vcffilter(int argc, char *argv[])
                  else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
                  else error("Could not parse: --mask-overlap %s\n",optarg);
                  break;
+            case  12 : args->write_index = 1; break;
              case 'h':
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
@@ -674,6 +679,7 @@ int main_vcffilter(int argc, char *argv[])
  
      init_data(args);
      if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      while ( bcf_sr_next_line(args->files) )
      {
          bcf1_t *line = bcf_sr_get_line(args->files, 0);
@@ -715,7 +721,15 @@ int main_vcffilter(int argc, char *argv[])
          }
      }
      buffered_filters(args, NULL);
-
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
      destroy_data(args);
      bcf_sr_destroy(args->files);
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c

index f646e1f6d17e98ac5913159532956ab514aff1e7..561be62a5c4679ad381eb8a6388a0f03f9dfec68 100644 (file)
--- a/bcftools/vcfgtcheck.c
+++ b/bcftools/vcfgtcheck.c
@@ -1,6 +1,6 @@
  /*  vcfgtcheck.c -- Check sample identity.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -59,6 +59,7 @@ typedef struct
      int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
      int regions_overlap, targets_overlap;
      int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+    int nused[2][2];
      double *pdiff, *qry_prob, *gt_prob;
      uint32_t *ndiff,*ncnt,ncmp, npairs;
      int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
@@ -309,7 +310,7 @@ static void init_data(args_t *args)
          init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
      }
      if ( args->gt_samples )
-    {   
+    {
          init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
              args->gt_hdr ? args->gt_hdr : args->qry_hdr,
              args->gt_fname ? args->gt_fname : args->qry_fname);
@@ -377,7 +378,7 @@ static void init_data(args_t *args)
          args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
  
          // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
-        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding 
+        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
          // probabilities of 0/0, 0/1, and 1/1 genotypes
          for (i=0; i<8; i++)
              for (j=0; j<3; j++)
@@ -555,7 +556,9 @@ static void process_line(args_t *args)
          args->gt_arr = args->qry_arr;
      }
  
+    // stats: number of compared sites, and used tags
      args->ncmp++;
+    args->nused[qry_use_GT][gt_use_GT]++;
  
      double af,hwe_dsg[8];
      if ( args->calc_hwe_prob )
@@ -636,7 +639,7 @@ static void process_line(args_t *args)
                  gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
                  if ( !gt_dsg ) continue;                        // missing value
                  if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
-               
+
                  ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
                  qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
                  if ( !qry_dsg ) continue;                       // missing value
@@ -797,11 +800,15 @@ static void report(args_t *args)
      fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
      fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
      fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+    fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+    fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+    fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
      fprintf(args->fp,"# DC, discordance:\n");
      fprintf(args->fp,"#     - query sample\n");
      fprintf(args->fp,"#     - genotyped sample\n");
-    fprintf(args->fp,"#     - discordance (number of mismatches; smaller is better)\n");
-    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+    fprintf(args->fp,"#     - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
+    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
      fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
      fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
  
@@ -1023,7 +1030,7 @@ static int is_input_okay(args_t *args, int nmatch)
      return 1;
  
  not_okay:
-    fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", 
+    fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
          bcf_seqname(hdr,rec),rec->pos+1,msg);
      return 0;
  }
@@ -1097,7 +1104,7 @@ int main_vcfgtcheck(int argc, char *argv[])
      args->es_max_mem = strdup("500M");
  
      // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
-    //    - min_inter: pairs with smaller err value will be considered identical 
+    //    - min_inter: pairs with smaller err value will be considered identical
      //    - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
      //                  different. If negative, the cutoff may be heuristically lowered
      args->min_inter_err =  0.23;
@@ -1169,7 +1176,7 @@ int main_vcfgtcheck(int argc, char *argv[])
              case 3 : args->calc_hwe_prob = 0; break;
              case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
              case 5 : args->dry_run = 1; break;
-            case 6 : 
+            case 6 :
                  args->distinctive_sites = strtod(optarg,&tmp);
                  if ( *tmp )
                  {
@@ -1202,7 +1209,7 @@ int main_vcfgtcheck(int argc, char *argv[])
                  else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
                  else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
                  break;
-            case 'S': 
+            case 'S':
                  if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
                  else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
                  else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c

index e0a70ba9eadb125d59ea4610ebc8e87365e06205..54568b0544802098070c6c11532560f9fa537d4d 100644 (file)
--- a/bcftools/vcfgtcheck.c.pysam.c
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfgtcheck.c -- Check sample identity.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -61,6 +61,7 @@ typedef struct
      int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
      int regions_overlap, targets_overlap;
      int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+    int nused[2][2];
      double *pdiff, *qry_prob, *gt_prob;
      uint32_t *ndiff,*ncnt,ncmp, npairs;
      int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
@@ -311,7 +312,7 @@ static void init_data(args_t *args)
          init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
      }
      if ( args->gt_samples )
-    {   
+    {
          init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
              args->gt_hdr ? args->gt_hdr : args->qry_hdr,
              args->gt_fname ? args->gt_fname : args->qry_fname);
@@ -379,7 +380,7 @@ static void init_data(args_t *args)
          args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
  
          // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
-        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding 
+        // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
          // probabilities of 0/0, 0/1, and 1/1 genotypes
          for (i=0; i<8; i++)
              for (j=0; j<3; j++)
@@ -557,7 +558,9 @@ static void process_line(args_t *args)
          args->gt_arr = args->qry_arr;
      }
  
+    // stats: number of compared sites, and used tags
      args->ncmp++;
+    args->nused[qry_use_GT][gt_use_GT]++;
  
      double af,hwe_dsg[8];
      if ( args->calc_hwe_prob )
@@ -638,7 +641,7 @@ static void process_line(args_t *args)
                  gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
                  if ( !gt_dsg ) continue;                        // missing value
                  if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
-               
+
                  ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
                  qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
                  if ( !qry_dsg ) continue;                       // missing value
@@ -799,11 +802,15 @@ static void report(args_t *args)
      fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
      fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
      fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+    fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+    fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+    fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
      fprintf(args->fp,"# DC, discordance:\n");
      fprintf(args->fp,"#     - query sample\n");
      fprintf(args->fp,"#     - genotyped sample\n");
-    fprintf(args->fp,"#     - discordance (number of mismatches; smaller is better)\n");
-    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+    fprintf(args->fp,"#     - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
+    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
      fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
      fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
  
@@ -1025,7 +1032,7 @@ static int is_input_okay(args_t *args, int nmatch)
      return 1;
  
  not_okay:
-    fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", 
+    fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
          bcf_seqname(hdr,rec),rec->pos+1,msg);
      return 0;
  }
@@ -1099,7 +1106,7 @@ int main_vcfgtcheck(int argc, char *argv[])
      args->es_max_mem = strdup("500M");
  
      // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
-    //    - min_inter: pairs with smaller err value will be considered identical 
+    //    - min_inter: pairs with smaller err value will be considered identical
      //    - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
      //                  different. If negative, the cutoff may be heuristically lowered
      args->min_inter_err =  0.23;
@@ -1171,7 +1178,7 @@ int main_vcfgtcheck(int argc, char *argv[])
              case 3 : args->calc_hwe_prob = 0; break;
              case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
              case 5 : args->dry_run = 1; break;
-            case 6 : 
+            case 6 :
                  args->distinctive_sites = strtod(optarg,&tmp);
                  if ( *tmp )
                  {
@@ -1204,7 +1211,7 @@ int main_vcfgtcheck(int argc, char *argv[])
                  else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
                  else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
                  break;
-            case 'S': 
+            case 'S':
                  if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
                  else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
                  else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c

index a755a85b41e35d910a77e532d18403e08bf6558c..4ee29b4c87f92ed843bb8a7eb1bff4dcf68366cb 100644 (file)
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -1,6 +1,6 @@
  /*  vcfisec.c -- Create intersections, unions and complements of VCF files.
  
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -60,6 +60,8 @@ typedef struct
      char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
      char *isec_exact;
      int argc, record_cmd_line;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -148,6 +150,8 @@ void isec_vcf(args_t *args)
          if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
          if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
          if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+        if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
      }
      if ( !args->nwrite && !out_std && !args->prefix )
          fprintf(stderr,"Note: -w option not given, printing list of sites...\n");
@@ -253,7 +257,19 @@ void isec_vcf(args_t *args)
          }
      }
      if ( str.s ) free(str.s);
-    if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+    if ( out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(out_fh)<0 )
+            {
+                if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+    }
  }
  
  static void add_filter(args_t *args, char *expr, int logic)
@@ -481,6 +497,7 @@ static void usage(void)
      fprintf(stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
      fprintf(stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
      fprintf(stderr, "    -w, --write LIST               List of files to write with -p given as 1-based indexes. By default, all files are written\n");
+    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Examples:\n");
      fprintf(stderr, "   # Create intersection and complements of two sets saving the output in dir/*\n");
@@ -537,6 +554,7 @@ int main_vcfisec(int argc, char *argv[])
          {"output-type",required_argument,NULL,'O'},
          {"threads",required_argument,NULL,9},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -608,6 +626,7 @@ int main_vcfisec(int argc, char *argv[])
                  break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c

index 50214a6d2abd3a4fde224900beeca538e4d0b1d8..76e4d3a9f57e13c6c7fcfb3dbda7da572c9cf86b 100644 (file)
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfisec.c -- Create intersections, unions and complements of VCF files.
  
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -62,6 +62,8 @@ typedef struct
      char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
      char *isec_exact;
      int argc, record_cmd_line;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -150,6 +152,8 @@ void isec_vcf(args_t *args)
          if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
          if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
          if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+        if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
+            error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
      }
      if ( !args->nwrite && !out_std && !args->prefix )
          fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n");
@@ -255,7 +259,19 @@ void isec_vcf(args_t *args)
          }
      }
      if ( str.s ) free(str.s);
-    if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+    if ( out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(out_fh)<0 )
+            {
+                if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+    }
  }
  
  static void add_filter(args_t *args, char *expr, int logic)
@@ -483,6 +499,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "        --targets-overlap 0|1|2    Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
      fprintf(bcftools_stderr, "        --threads INT              Use multithreading with <int> worker threads [0]\n");
      fprintf(bcftools_stderr, "    -w, --write LIST               List of files to write with -p given as 1-based indexes. By default, all files are written\n");
+    fprintf(bcftools_stderr, "        --write-index              Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Examples:\n");
      fprintf(bcftools_stderr, "   # Create intersection and complements of two sets saving the output in dir/*\n");
@@ -539,6 +556,7 @@ int main_vcfisec(int argc, char *argv[])
          {"output-type",required_argument,NULL,'O'},
          {"threads",required_argument,NULL,9},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -610,6 +628,7 @@ int main_vcfisec(int argc, char *argv[])
                  break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c

index 621f4102c262e1f7bb822af42d35e87ebc11fd48..87b6b8a39d62eb7ec115044cfe851b2d47f11737 100644 (file)
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -1,6 +1,6 @@
  /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
  
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -63,6 +63,19 @@ typedef khash_t(strdict) strdict_t;
  
  #define PL2PROB_MAX 1024
  
+// Rules for merging FORMAT Number=A,G,R vectors with missing values
+#define MERGE_MISSING_DOT   0   // leave as is, i.e. use a missing value "."
+#define MERGE_MISSING_CONST 1   // use a constant value
+#define MERGE_MISSING_MAX   2   // use the existing maximum value
+
+typedef struct _missing_rule_t
+{
+    char *hdr_tag;  // FORMAT tag name the rule applies to; heap copy made in missing_rules_init, freed in missing_rules_destroy
+    int type;       // one of MERGE_MISSING_DOT, MERGE_MISSING_CONST, MERGE_MISSING_MAX
+    float value;    // fill value; assigned only when type is MERGE_MISSING_CONST
+}
+missing_rule_t;
+
  // For merging INFO Number=A,G,R tags
  typedef struct
  {
@@ -103,29 +116,37 @@ typedef struct
      int *map;   // mapping from input alleles to the array of output alleles (set by merge_alleles)
      int mmap;   // size of map array (only buffer[i].n_allele is actually used)
      int als_differ;
+    int var_types;  // variant types in this record, shifted by <<1 to account for VCF_REF
  }
  maux1_t;
+
+// Buffered lines for a single reader
  typedef struct
  {
      int rid;        // current rid
      int beg,end;    // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+    int unkn_allele;// the index of the unknown allele (<*>, <NON_REF>)
      int cur;        // current line or -1 if none
      int mrec;       // allocated size of buf
      maux1_t *rec;   // buffer to keep reader's lines
      bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+    int var_types;  // reader's variant types in the active [beg,end] window
  }
  buffer_t;
  typedef struct
  {
-    int n, pos, var_types;  // number of readers, current position, currently available variant types
+    int n, pos, var_types;  // number of readers; current position; variant types at this position across all available records
+    int *als_types,         // allele type of each output allele
+        mals_types;
      char *chr;              // current chromosome
      char **als, **out_als;  // merged alleles (temp, may contain empty records) and merged alleles ready for output
      int nals, mals, nout_als, mout_als; // size of the output array
      int *cnt, ncnt; // number of records that refer to the alleles
      int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+    const char **fmt_key;// temporary short-lived array to store output tag names
      bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
      int nfmt_map;        // number of rows in the fmt_map array
-    int *agr_map, nagr_map, magr_map;   // mapping between Number=AGR element indexes
+    int *agr_map, nagr_map, magr_map;   // mapping between Number=AGR element indexes, from src idxs to dst file idxs
      void *tmp_arr;
      size_t ntmp_arr;
      buffer_t *buf;
@@ -156,6 +177,9 @@ typedef struct
      faidx_t *gvcf_fai;
      info_rule_t *rules;
      int nrules;
+    char *missing_rules_str;
+    missing_rule_t *missing_rules;    // lookup for -M, --missing-rules
+    int nmissing_rules;
      strdict_t *tmph;
      kstring_t tmps;
      bcf_srs_t *files;
@@ -166,6 +190,8 @@ typedef struct
      int argc, n_threads, record_cmd_line, clevel;
      int local_alleles;    // the value of -L option
      int keep_AC_AN;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -298,6 +324,89 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul
      }
  }
  
+static int missing_rules_comp_key2(const void *a, const void *b)
+{
+    // qsort comparator: orders two missing_rule_t entries by their FORMAT tag name
+    const char *tag1 = ((missing_rule_t*) a)->hdr_tag;
+    return strcmp(tag1, ((missing_rule_t*) b)->hdr_tag);
+}
+static int missing_rules_comp_key(const void *a, const void *b)
+{
+    // bsearch comparator: `a` is the looked-up tag name (a C string), `b` an entry of the sorted rules array
+    const char *wanted = (const char*) a;
+    return strcmp(wanted, ((missing_rule_t*) b)->hdr_tag);
+}
+// Parse the -M, --missing-rules string "TAG:RULE[,TAG:RULE...]" into args->missing_rules, sorted by tag name.
+// "-" stands for "PL:.,AD:."; without -M the rules "PL:max,AD:0" are used when merging gVCFs, otherwise none.
+static void missing_rules_init(args_t *args)
+{
+    kstring_t str = {0,0,0};
+    if ( args->missing_rules_str )
+    {
+        if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str);
+        else kputs(args->missing_rules_str,&str);
+    }
+    else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str);
+    else return;
+    // First pass: split the copy in place into NUL-terminated tokens, checking that ':' and ',' strictly alternate
+    args->nmissing_rules = 1;
+    char *ss = str.s, *tmp = ss;
+    int n = 0;
+    while ( *ss )
+    {
+        if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        ss++;
+    }
+    if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);
+    args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t));
+    // Second pass: walk the TAG,RULE token pairs; n is the number of rules still expected to be stored
+    n = args->nmissing_rules; args->nmissing_rules = 0;
+    ss = tmp;
+    while ( args->nmissing_rules < n  )
+    {
+        missing_rule_t *rule = &args->missing_rules[args->nmissing_rules];
+        rule->hdr_tag = strdup(ss);
+        int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
+        if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) )
+        {
+            if ( args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+            n--;    // a built-in default rule for a tag absent from the header: drop the pair and reuse this slot
+            ss = strchr(ss, '\0'); ss++;
+            if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag);
+            free(rule->hdr_tag); rule->hdr_tag = NULL;  // release only after the message above has read the tag
+            ss = strchr(ss, '\0'); ss++;
+            continue;
+        }
+        // The tag is defined in the output header: the next token is its rule (".", "max" or a number)
+        ss = strchr(ss, '\0'); ss++;
+        if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag);
+        if ( !strcasecmp(ss,".") ) rule->type = MERGE_MISSING_DOT;
+        else if ( !strcasecmp(ss,"max") ) rule->type = MERGE_MISSING_MAX;
+        else
+        {
+            char *end = ss;
+            rule->value = strtod(ss, &end);
+            if ( *end ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);    // trailing junk after the number
+            rule->type = MERGE_MISSING_CONST;
+        }
+        ss = strchr(ss, '\0'); ss++;
+        args->nmissing_rules++;
+    }
+    qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2);
+    free(str.s);
+}
+static void missing_rules_destroy(args_t *args)
+{
+    // Release the tag name owned by each stored rule (walking the array backwards),
+    // then the rule array itself.
+    int irule = args->nmissing_rules;
+    while ( irule-- > 0 )
+        free(args->missing_rules[irule].hdr_tag);
+
+    free(args->missing_rules);
+}
+
  static int info_rules_comp_key2(const void *a, const void *b)
  {
      info_rule_t *rule1 = (info_rule_t*) a;
@@ -770,6 +879,7 @@ void maux_destroy(maux_t *ma)
      int i,j;
      for (i=0; i<ma->nout_smpl; i++) free(ma->str[i].s);
      free(ma->str);
+    free(ma->als_types);
      for (i=0; i<ma->mals; i++)
      {
          free(ma->als[i]);
@@ -793,6 +903,7 @@ void maux_destroy(maux_t *ma)
      free(ma->AGR_info);
      if (ma->ntmp_arr) free(ma->tmp_arr);
      if (ma->nfmt_map) free(ma->fmt_map);
+    free(ma->fmt_key);
      // ma->inf freed in bcf_destroy1
      for (i=0; i<ma->mals; i++) free(ma->als[i]);
      if (ma->mout_als) free(ma->out_als);
@@ -820,7 +931,6 @@ void maux_reset(maux_t *ma, int *rid_tab)
  {
      int i,j;
      for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
-    for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
      for (i=0; i<ma->mals; i++)
      {
          free(ma->als[i]);
@@ -856,6 +966,7 @@ void maux_reset(maux_t *ma, int *rid_tab)
          for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
          {
              ma->buf[i].rec[j].skip = 0;
+            ma->buf[i].rec[j].var_types = 0;
              bcf1_t *line = ma->files->readers[i].buffer[j];
              if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
          }
@@ -959,12 +1070,14 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
          int ir, j;
          for (ir=0; ir<files->nreaders; ir++)
          {
+            ma->buf[ir].unkn_allele = 0;
              bcf1_t *line = maux_get_line(args,ir);
              if ( !line ) continue;
              for (j=1; j<line->n_allele; j++)
              {
                  int irec = ma->buf[ir].cur;
                  if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+                if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j;
              }
          }
      }
@@ -1985,7 +2098,7 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
          bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
      ma->laa_dirty = 1;
  }
-void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out)
  {
      bcf_srs_t *files = args->files;
      bcf_hdr_t *out_hdr = args->out_hdr;
@@ -2135,12 +2248,32 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                          for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
                          continue; \
                      } \
-                    int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
-                    for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+                    int haploid = ma->smpl_ploidy[ismpl+j]==1 ? 1 : 0; \
+                    int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+                    if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
+                    {  \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
+                        for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+                    { \
+                        for (l=0; l<ngsize; l++) { *tgt = mrule->value; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+                    { \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src_type_t max = src[0]; \
+                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
+                    } \
+                    else \
+                    { \
+                        for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+                    } \
                      for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
-                    if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+                    if ( haploid ) \
                      { \
-                        /* Haploid */ \
                          int iori, inew; \
                          for (iori=0; iori<line->n_allele; iori++) \
                          { \
@@ -2194,7 +2327,26 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                          continue; \
                      } \
                      src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
-                    for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+                    if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
+                    { \
+                        int iunkn = ma->buf[i].unkn_allele; \
+                        for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+                    { \
+                        for (l=0; l<nsize; l++) { *tgt = mrule->value; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+                    { \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src_type_t max = src[0]; \
+                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
+                    } \
+                    else \
+                    { \
+                        for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+                    } \
                      int iori,inew; \
                      for (iori=ifrom; iori<line->n_allele; iori++) \
                      { \
@@ -2234,6 +2386,7 @@ void merge_format(args_t *args, bcf1_t *out)
      {
          ma->nfmt_map = 2;
          ma->fmt_map  = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+        ma->fmt_key  = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key));
      }
      else
          memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
@@ -2250,7 +2403,7 @@ void merge_format(args_t *args, bcf1_t *out)
          bcf_hdr_t *hdr = reader->header;
          for (j=0; j<line->n_fmt; j++)
          {
-            // Wat this tag already seen?
+            // Was this tag already seen?
              bcf_fmt_t *fmt = &line->d.fmt[j];
              const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
              kitr = kh_get(strdict, tmph, key);
@@ -2269,9 +2422,11 @@ void merge_format(args_t *args, bcf1_t *out)
                      {
                          ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
                          memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+                        ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1));
                          ma->nfmt_map = max_ifmt+1;
                      }
                      if ( key[0]=='P' && key[1]=='L' && key[2]==0  ) { has_PL = ifmt; }
+                    ma->fmt_key[max_ifmt] = key;
                  }
                  kitr = kh_put(strdict, tmph, key, &ret);
                  kh_value(tmph, kitr) = ifmt;
@@ -2298,7 +2453,10 @@ void merge_format(args_t *args, bcf1_t *out)
          update_AN_AC(out_hdr, out);
  
      for (i=1; i<=max_ifmt; i++)
-        merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+    {
+        missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key);
+        merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out);
+    }
  
      if ( ma->laa_dirty )
          update_local_alleles(args, out);
@@ -2406,6 +2564,9 @@ void gvcf_write_block(args_t *args, int start, int end)
      {
          int slen  = 0;
          char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+        if (!seq)
+            exit(1); // faidx_fetch_seq has already reported the error.
+
          if (slen)
          {
              out->d.allele[0][0] = seq[0];
@@ -2520,16 +2681,6 @@ static inline int is_gvcf_block(bcf1_t *line)
      return 0;
  }
  
-// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
-// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
-// to accommodate for VCF_GVCF_REF defined below
-static const int
-    snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
-    indel_mask = VCF_INDEL<<2,
-    ins_mask = VCF_INS<<2,
-    del_mask = VCF_DEL<<2,
-    ref_mask = 2;
-
  /*
      Check incoming lines for new gVCF blocks, set pointer to the current source
      buffer (gvcf or readers).  In contrast to gvcf_flush, this function can be
@@ -2629,7 +2780,7 @@ void clean_buffer(args_t *args)
          {
              if ( ma->gvcf[ir].active )
              {
-                if ( ma->pos >= ma->gvcf[ir].end )  ma->gvcf[ir].active = 0;
+                if ( ma->pos > ma->gvcf[ir].end )  ma->gvcf[ir].active = 0;
                  else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg;  // re-activate interrupted gVCF block
              }
              if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
@@ -2664,13 +2815,16 @@ void debug_maux(args_t *args)
      {
          bcf_sr_t *reader = &files->readers[j];
          buffer_t *buf = &maux->buf[j];
-        fprintf(stderr," reader %d: ", j);
+        fprintf(stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end);
          for (k=buf->beg; k<buf->end; k++)
          {
-            if ( buf->rec[k].skip & SKIP_DONE ) continue;
-            bcf1_t *line = reader->buffer[k];
+            if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(stderr," DONE"); continue; }
+            bcf1_t *line = reader->buffer[k];               // selected for merging by can_merge
              fprintf(stderr,"\t");
-            if ( buf->rec[k].skip ) fprintf(stderr,"[");  // this record will not be merged in this round
+            if ( buf->cur==k ) fprintf(stderr,"!");         // selected for merging by stage_line
+            if ( buf->rec[k].skip ) fprintf(stderr,"[");    // this record cannot be merged in this round
+            if ( !line->n_allele && maux->gvcf[j].active )
+                fprintf(stderr,"<*>");
              for (l=0; l<line->n_allele; l++)
                  fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]);
              if ( buf->rec[k].skip ) fprintf(stderr,"]");
@@ -2686,9 +2840,10 @@ void debug_state(args_t *args)
  {
      maux_t *maux = args->maux;
      int i,j;
+    fprintf(stderr,"State after position=%d done:\n",maux->pos+1);
      for (i=0; i<args->files->nreaders; i++)
      {
-        fprintf(stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+        fprintf(stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
          if ( maux->buf[i].cur >=0 )
          {
              bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
@@ -2698,20 +2853,136 @@ void debug_state(args_t *args)
          }
          fprintf(stderr,"\n");
      }
-    fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min);
+    fprintf(stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min);
      for (i=0; i<args->files->nreaders; i++)
      {
-        fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+        fprintf(stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
          if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1);
          fprintf(stderr,"\n");
      }
      fprintf(stderr,"\n");
  }
  
+
+// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
+// but shift by one bit to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault)
+static const int
+    snp_mask   = (VCF_SNP<<1)|(VCF_MNP<<1),
+    indel_mask = (VCF_INDEL<<1),
+    ins_mask   = VCF_INS<<1,
+    del_mask   = VCF_DEL<<1,
+    ref_mask   = 1;
+
+// Can these types be merged given the -m settings? Despite the function's name, its focus is on
+// excluding incompatible records, there will be a finer matching later in stage_line()
+static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+{
+    int k;
+    maux_t *maux = args->maux;
+    bcf1_t *rec = buf->lines[irec];
+    int rec_types = buf->rec[irec].var_types;
+
+    assert( selected_types );   // this is trivially true, set in can_merge()
+
+    if ( args->collapse & COLLAPSE_ANY ) return 1;  // can merge anything with anything
+
+    // REF and gVCF_REF with no other alleles present can be merged with anything
+    if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+    if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
+
+    if ( args->collapse!=COLLAPSE_NONE )
+    {
+        // If we are here, one of the following modes must have been set: both,snps,indels,snp-ins-del
+        // Include the new record if
+        //  - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del
+        //  - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
+        if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
+        {
+            if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+        }
+        if ( args->collapse&COLLAPSE_INDELS )
+        {
+            if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+        }
+        if ( args->collapse&COLLAPSE_SNP_INS_DEL )
+        {
+            if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
+            if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+        }
+        // Whatever is left, allow to match if the alleles match exactly
+    }
+
+    // The -m none mode or exact matching requested
+    // Simple test first: are the variants of the same type?
+    int x = selected_types >> 1;        // remove REF
+    int y = rec_types >> 1;             // remove REF
+    while ( x && y ) { x>>=1; y>>=1; }  // both reach 0 together only if their highest set bits coincide
+    if ( x || y ) return 0;             // the types differ
+
+    if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0;   // refs are not compatible
+    for (k=1; k<rec->n_allele; k++)
+    {
+        if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue;    // this must be gVCF_REF (<*> or <NON_REF>)
+        if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;    // stop at the first allele found among the selected ALTs
+    }
+    if ( k==rec->n_allele ) return 0;   // no allele of rec (gVCF_REF aside) was found among the alleles selected thus far
+    return 1;   // at least one allele of rec is already among the alleles selected thus far
+}
+
+static void maux_update_alleles(args_t *args, int ireader, int irec)   // add the alleles of record irec of reader ireader to maux->als
+{
+    int k;
+    bcf_sr_t *reader = &args->files->readers[ireader];
+    maux_t *maux = args->maux;
+    buffer_t *buf = &maux->buf[ireader];
+    maux1_t *ma1 = &buf->rec[irec];
+    bcf1_t *line = buf->lines[irec];
+    hts_expand(int, line->n_allele, ma1->mmap, ma1->map);   // room for one map entry per allele of this record
+    if ( !maux->nals )  // first record to be merged, copy the alleles to the output
+    {
+        maux->nals = line->n_allele;
+        hts_expand0(char*, maux->nals, maux->mals, maux->als);
+        hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+        hts_expand0(int, maux->nals, maux->mals_types, maux->als_types);
+        for (k=0; k<maux->nals; k++)
+        {
+            free(maux->als[k]);
+            maux->als[k] = strdup(line->d.allele[k]);
+            ma1->map[k]  = k;   // identity mapping, the output alleles are a copy of this record's alleles
+            maux->cnt[k] = 1;
+            int var_type = bcf_has_variant_type(line, k, VCF_ANY);
+            if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL;    // drop the VCF_INDEL bit in this mode
+            maux->als_types[k] = var_type ? var_type<<1 : ref_mask;     // stored shifted by one bit; a zero type is recorded as ref_mask
+        }
+        return;
+    }
+    // normalize alleles
+    maux->als = merge_alleles(line->d.allele, line->n_allele, ma1->map, maux->als, &maux->nals, &maux->mals);
+    if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
+    hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+    hts_expand0(int, maux->nals, maux->mals_types, maux->als_types);
+    for (k=1; k<line->n_allele; k++)
+    {
+        int ik = ma1->map[k];   // index of this record's k-th allele in the merged allele list
+        int var_type = bcf_has_variant_type(line, k, VCF_ANY);
+        if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL;    // drop the VCF_INDEL bit in this mode
+        maux->als_types[ik] = var_type ? var_type<<1 : ref_mask;    // same encoding as in the first-record branch above
+        maux->cnt[ik]++;    // how many times an allele appears in the files
+    }
+    maux->cnt[0]++;     // REF is counted once per merged record
+}
+
  /*
-   Determine which line should be merged from which reader: go through all
-   readers and all buffered lines, expand REF,ALT and try to match lines with
-   the same ALTs.
+   Determine which lines remain to be merged across readers at the current position and
+   are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible,
+   SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time.
+
+   At the same time count how many times each allele is present across the readers and records
+   so that we can prioritize the records with the same alleles to come first. In the end, at most
+   one record at a time can be selected from each reader and that will be done in stage_line().
+
+   The function maux_reset already initialized structures for this position, so here each
+   reader comes with the beg,end indexes that point to records with the same maux_t.pos position.
   */
  int can_merge(args_t *args)
  {
@@ -2719,28 +2990,39 @@ int can_merge(args_t *args)
      maux_t *maux = args->maux;
      gvcf_aux_t *gaux = maux->gvcf;
      char *id = NULL, ref = 'N';
-    int i,j,k, ntodo = 0;
+    int i,j, ntodo = 0;
  
      for (i=0; i<maux->nals; i++)
      {
          free(maux->als[i]);
          maux->als[i] = NULL;
+        maux->cnt[i] = 0;
      }
      maux->var_types = maux->nals = 0;
  
-    // this is only for the `-m none -g` mode, ensure that <*> lines come last
-    #define VCF_GVCF_REF 1
-
+    // In this loop we do the following:
+    //  - remember the first encountered ID if matching by ID
+    //  - count the number of unprocessed records at this position
+    //  - collect all variant types at this position. This is to be able to perform -m matching and
+    //    print SNVs first, then indels, then gVCF blocks
+    //  - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet
      for (i=0; i<files->nreaders; i++)
      {
          buffer_t *buf = &maux->buf[i];
+        buf->var_types = 0;
  
-        if ( gaux && gaux[i].active )
+        if ( gaux && gaux[i].active ) // active gvcf block
          {
-            // skip readers with active gvcf blocks
              buf->rec[buf->beg].skip = SKIP_DIFF;
+            maux->var_types |= ref_mask;
+            buf->var_types |= ref_mask;
+            buf->rec[buf->beg].var_types = ref_mask;
              continue;
          }
+
+        // for gvcf: find out REF at this position
+        if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0];
+
          for (j=buf->beg; j<buf->end; j++)
          {
              if ( buf->rec[j].skip & SKIP_DONE ) continue;
@@ -2749,118 +3031,70 @@ int can_merge(args_t *args)
              ntodo++;
  
              bcf1_t *line = buf->lines[j];
-            if ( args->merge_by_id )
-                id = line->d.id;
-            else
+            if ( args->merge_by_id && !id ) { id = line->d.id; continue; }      // set ID when merging by id
+
+            if ( !buf->rec[j].var_types )
              {
                  int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
-                if (var_type < 0) error("bcf_has_variant_types() failed.");
+                if ( var_type < 0 ) error("bcf_has_variant_types() failed.");
                  if ( args->collapse==COLLAPSE_SNP_INS_DEL )
                  {
                      // need to distinguish between ins and del so strip the VCF_INDEL flag
                      var_type &= ~VCF_INDEL;
                  }
-                maux->var_types |= var_type ? var_type<<2 : 2;
-
-                // for the `-m none -g` mode
-                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
-                    maux->var_types |= VCF_GVCF_REF;
+                var_type = var_type ? var_type<<1 : ref_mask;
+                if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+                buf->rec[j].var_types = var_type;
              }
+            maux->var_types |= buf->rec[j].var_types;
+            buf->var_types |= buf->rec[j].var_types;
          }
-
-        // for gvcf: find out REF at this position
-        if ( buf->beg < buf->end && ref=='N' )
-            ref = buf->lines[buf->beg]->d.allele[0][0];
      }
      if ( !ntodo ) return 0;
  
+    int selected_types = 0;
+
      // In this loop we select from each reader compatible candidate lines.
      // (i.e. SNPs or indels). Go through all files and all lines at this
      // position and normalize relevant alleles.
      // REF-only sites may be associated with both SNPs and indels.
      for (i=0; i<files->nreaders; i++)
      {
-        bcf_sr_t *reader = &files->readers[i];
          buffer_t *buf = &maux->buf[i];
-
          if ( gaux && gaux[i].active )
          {
+            // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position
              gaux[i].line->d.allele[0][0] = ref;
              gaux[i].line->pos = maux->pos;
+            maux_update_alleles(args, i, buf->beg);
+            selected_types |= ref_mask;
+            continue;
          }
-
          for (j=buf->beg; j<buf->end; j++)
          {
              if ( buf->rec[j].skip & SKIP_DONE ) continue;
  
              bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
-
-            int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
-            if (line_type < 0) error("bcf_has_variant_types() failed.");
-            line_type = line_type ? line_type<<2 : 2;
+            int line_types = buf->rec[j].var_types;
  
              // select relevant lines
              if ( args->merge_by_id )
              {
-                if ( strcmp(id,line->d.id) ) continue;
+                if ( strcmp(id,line->d.id) ) continue;      // matching by ID and it does not match the selected record
              }
+            else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
              else
              {
-                // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
-                // records come last, otherwise infinite loop is created (#1164)
-                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
-                {
-                    if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
-                }
-                if ( args->collapse==COLLAPSE_NONE && maux->nals )
-                {
-                    // All alleles of the tested record must be present in the
-                    // selected maux record plus variant types must be the same
-                    if ( (maux->var_types & line_type) != line_type ) continue;
-                    if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue;   // refs not compatible
-                    for (k=1; k<line->n_allele; k++)
-                    {
-                        if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
-                    }
-                    if ( !(line_type&ref_mask) && k==line->n_allele ) continue;  // not a REF-only site and there is no matching allele
-                }
-                if ( !(args->collapse&COLLAPSE_ANY) )
-                {
-                    // Merge:
-                    //  - SNPs+SNPs+MNPs+REF if -m both,snps
-                    //  - indels+indels+REF  if -m both,indels, REF only if SNPs are not present
-                    //  - SNPs come first
-                    if ( line_type & (indel_mask|ins_mask|del_mask) )
-                    {
-                        if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue;  // SNPs come first
-                        if ( args->do_gvcf && maux->var_types&ref_mask ) continue;  // never merge indels with gVCF blocks
-                    }
-                }
+                // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+                if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE)     // asked to merge SNVs into multiallelics
+                        && (maux->var_types&snp_mask)                   // there are SNVs at the current position
+                        && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+                   ) continue;
              }
-            buf->rec[j].skip = 0;
+            selected_types |= line_types;
  
-            hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
-            if ( !maux->nals )    // first record, copy the alleles to the output
-            {
-                maux->nals = line->n_allele;
-                hts_expand0(char*, maux->nals, maux->mals, maux->als);
-                hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
-                for (k=0; k<maux->nals; k++)
-                {
-                    free(maux->als[k]);
-                    maux->als[k] = strdup(line->d.allele[k]);
-                    buf->rec[j].map[k] = k;
-                    maux->cnt[k] = 1;
-                }
-                continue;
-            }
-            // normalize alleles
-            maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
-            if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
-            hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
-            for (k=1; k<line->n_allele; k++)
-                maux->cnt[ buf->rec[j].map[k] ]++;    // how many times an allele appears in the files
-            maux->cnt[0]++;
+            buf->rec[j].skip = 0;   // the j-th record from i-th reader can be included. Final decision will be made in stage_line
+            maux_update_alleles(args, i, j);
          }
      }
      return 1;
@@ -2878,48 +3112,61 @@ void stage_line(args_t *args)
      bcf_srs_t *files = args->files;
      maux_t *maux = args->maux;
  
-    // debug_maux(args);
-
-    // take the most frequent allele present in multiple files, REF is skipped
-    int i,j,k,icnt = 1;
-    for (i=2; i<maux->nals; i++)
-        if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+    // Take the most frequent allele present in multiple files, REF and gVCF_REF are skipped.
+    int i,j,k,icnt = -1;
+    for (i=1; i<maux->nals; i++)
+    {
+        if ( maux->als_types[i] & ref_mask ) continue;
+        if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i;
+    }
+    int selected_type = icnt>0 ? maux->als_types[icnt] : ref_mask;
  
      int nout = 0;
      for (i=0; i<files->nreaders; i++)
      {
          buffer_t *buf = &maux->buf[i];
          buf->cur = -1;
-        if ( buf->beg >= buf->end ) continue;   // no lines in the buffer
+        if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site
  
          // find lines with the same allele
          for (j=buf->beg; j<buf->end; j++)
          {
-            if ( buf->rec[j].skip ) continue;   // done or not compatible
-            if ( args->merge_by_id ) break;
-            if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break;   // REF-only record
+            if ( buf->rec[j].skip )
+            {
+                int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
+                if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+                if ( !is_gvcf ) continue;   // done or not compatible
+            }
+            if ( args->merge_by_id ) break;     // if merging by ID and the line is compatible, then this is THE line
+
+            // skip if the reader has a record that matches the most frequent allele and this record is not it
+            if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue;
  
+            // if the reader does not have the most frequent allele type but is a ref, accept
+            if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break;
+            if ( selected_type==ref_mask ) break;
+
+            // accept if the record has the most frequent allele
              for (k=0; k<buf->lines[j]->n_allele; k++)
                  if ( icnt==buf->rec[j].map[k] ) break;
-
              if ( k<buf->lines[j]->n_allele ) break;
          }
          if ( j>=buf->end )
          {
              // no matching allele found in this file
-            if ( args->collapse==COLLAPSE_NONE ) continue;
+            if ( args->collapse==COLLAPSE_NONE ) continue;  // exact matching requested, skip
  
+            // choose something compatible to create a multiallelic site given the -m criteria
              for (j=buf->beg; j<buf->end; j++)
              {
                  if ( buf->rec[j].skip ) continue;   // done or not compatible
                  if ( args->collapse&COLLAPSE_ANY ) break;   // anything can be merged
-                int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
-                if (line_type < 0) error("bcf_has_variant_types() failed.");
-                if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
-                if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
-                if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
-                if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
-                if ( line_type==VCF_REF )
+                int line_type = buf->rec[j].var_types;
+                if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+                if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+                if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+                if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+                if ( line_type&ref_mask )
                  {
                      if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
                      if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
@@ -2940,12 +3187,21 @@ void stage_line(args_t *args)
          {
              // found a suitable line for merging
              buf->cur = j;
-
-            // mark as finished so that it's ignored next time
-            buf->rec[j].skip  = SKIP_DONE;
-            nout++;
          }
      }
+
+    // debug_maux(args);
+
+    // Mark lines staged for merging as finished so that they are ignored next time
+    for (i=0; i<files->nreaders; i++)
+    {
+        buffer_t *buf = &maux->buf[i];
+        if ( buf->cur == -1 ) continue;
+
+        buf->rec[buf->cur].skip  = SKIP_DONE;
+        nout++;
+    }
+
      assert( nout );
  }
  
@@ -3078,6 +3334,7 @@ void merge_vcf(args_t *args)
              error_errno("[%s] Failed to update header", __func__);
      }
      info_rules_init(args);
+    missing_rules_init(args);
  
      bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
      if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
@@ -3087,6 +3344,7 @@ void merge_vcf(args_t *args)
          if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
          return;
      }
+    else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
      args->maux = maux_init(args);
@@ -3122,9 +3380,19 @@ void merge_vcf(args_t *args)
          gvcf_flush(args,1);
  
      info_rules_destroy(args);
+    missing_rules_destroy(args);
      maux_destroy(args->maux);
      bcf_hdr_destroy(args->out_hdr);
-    if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
+    if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"stdout");
      bcf_destroy1(args->out_line);
      kh_destroy(strdict, args->tmph);
      if ( args->tmps.m ) free(args->tmps.s);
@@ -3146,11 +3414,12 @@ static void usage(void)
      fprintf(stderr, "    -0  --missing-to-ref              Assume genotypes at missing sites are 0/0\n");
      fprintf(stderr, "    -f, --apply-filters LIST          Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
      fprintf(stderr, "    -F, --filter-logic x|+            Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
-    fprintf(stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+    fprintf(stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
      fprintf(stderr, "    -i, --info-rules TAG:METHOD,..    Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
      fprintf(stderr, "    -l, --file-list FILE              Read file names from the file\n");
      fprintf(stderr, "    -L, --local-alleles INT           EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
      fprintf(stderr, "    -m, --merge STRING                Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+    fprintf(stderr, "    -M, --missing-rules TAG:METHOD    Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
      fprintf(stderr, "        --no-index                    Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
      fprintf(stderr, "        --no-version                  Do not append version and command line to the header\n");
      fprintf(stderr, "    -o, --output FILE                 Write output to a file [standard output]\n");
@@ -3159,6 +3428,7 @@ static void usage(void)
      fprintf(stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
      fprintf(stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
      fprintf(stderr, "        --threads INT                 Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "        --write-index                 Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -3197,13 +3467,15 @@ int main_vcfmerge(int argc, char *argv[])
          {"regions-file",required_argument,NULL,'R'},
          {"regions-overlap",required_argument,NULL,4},
          {"info-rules",required_argument,NULL,'i'},
+        {"missing-rules",required_argument,NULL,'M'},
          {"no-version",no_argument,NULL,8},
          {"no-index",no_argument,NULL,10},
          {"filter-logic",required_argument,NULL,'F'},
+        {"write-index",no_argument,NULL,11},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
          switch (c) {
              case 'L':
                  args->local_alleles = strtol(optarg,&tmp,10);
@@ -3227,6 +3499,7 @@ int main_vcfmerge(int argc, char *argv[])
                  break;
              case 'l': args->file_list = optarg; break;
              case 'i': args->info_rules = optarg; break;
+            case 'M': args->missing_rules_str = optarg; break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
                  switch (optarg[0]) {
@@ -3254,7 +3527,7 @@ int main_vcfmerge(int argc, char *argv[])
                  else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
                  else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
                  else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
-                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
+                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
                  else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
                  else error("The -m type \"%s\" is not recognised.\n", optarg);
                  break;
@@ -3271,6 +3544,7 @@ int main_vcfmerge(int argc, char *argv[])
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
              case 10 : args->no_index = 1; break;
+            case 11 : args->write_index = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c

index 2231a5750226f4500f9ff6f9c252c4d18f30614e..7ce5dfa8d1e36bf609844014323dfddb108e0207 100644 (file)
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
  
-    Copyright (C) 2012-2022 Genome Research Ltd.
+    Copyright (C) 2012-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -65,6 +65,19 @@ typedef khash_t(strdict) strdict_t;
  
  #define PL2PROB_MAX 1024
  
+// Rules for merging FORMAT Number=A,G,R vectors with missing values
+#define MERGE_MISSING_DOT   0   // leave as is, i.e. use a missing value "."
+#define MERGE_MISSING_CONST 1   // use a constant value
+#define MERGE_MISSING_MAX   2   // use the existing maximum value
+
+typedef struct _missing_rule_t
+{
+    char *hdr_tag;
+    int type;
+    float value;
+}
+missing_rule_t;
+
  // For merging INFO Number=A,G,R tags
  typedef struct
  {
@@ -105,29 +118,37 @@ typedef struct
      int *map;   // mapping from input alleles to the array of output alleles (set by merge_alleles)
      int mmap;   // size of map array (only buffer[i].n_allele is actually used)
      int als_differ;
+    int var_types;  // variant types in this record, shifted by <<1 to account for VCF_REF
  }
  maux1_t;
+
+// Buffered lines for a single reader
  typedef struct
  {
      int rid;        // current rid
      int beg,end;    // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+    int unkn_allele;// the index of the unknown allele (<*>, <NON_REF>)
      int cur;        // current line or -1 if none
      int mrec;       // allocated size of buf
      maux1_t *rec;   // buffer to keep reader's lines
      bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+    int var_types;  // reader's variant types in the active [beg,end] window
  }
  buffer_t;
  typedef struct
  {
-    int n, pos, var_types;  // number of readers, current position, currently available variant types
+    int n, pos, var_types;  // number of readers; current position; variant types at this position across all available records
+    int *als_types,         // allele type of each output allele
+        mals_types;
      char *chr;              // current chromosome
      char **als, **out_als;  // merged alleles (temp, may contain empty records) and merged alleles ready for output
      int nals, mals, nout_als, mout_als; // size of the output array
      int *cnt, ncnt; // number of records that refer to the alleles
      int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+    const char **fmt_key;// temporary short-lived array to store output tag names
      bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
      int nfmt_map;        // number of rows in the fmt_map array
-    int *agr_map, nagr_map, magr_map;   // mapping between Number=AGR element indexes
+    int *agr_map, nagr_map, magr_map;   // mapping between Number=AGR element indexes, from src idxs to dst file idxs
      void *tmp_arr;
      size_t ntmp_arr;
      buffer_t *buf;
@@ -158,6 +179,9 @@ typedef struct
      faidx_t *gvcf_fai;
      info_rule_t *rules;
      int nrules;
+    char *missing_rules_str;
+    missing_rule_t *missing_rules;    // lookup for -M, --missing-rules
+    int nmissing_rules;
      strdict_t *tmph;
      kstring_t tmps;
      bcf_srs_t *files;
@@ -168,6 +192,8 @@ typedef struct
      int argc, n_threads, record_cmd_line, clevel;
      int local_alleles;    // the value of -L option
      int keep_AC_AN;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -300,6 +326,89 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul
      }
  }
  
+static int missing_rules_comp_key2(const void *a, const void *b)   // comparator over two missing_rule_t elements, passed to qsort in missing_rules_init
+{
+    missing_rule_t *rule1 = (missing_rule_t*) a;
+    missing_rule_t *rule2 = (missing_rule_t*) b;
+    return strcmp(rule1->hdr_tag, rule2->hdr_tag);     // order rules by their tag name
+}
+static int missing_rules_comp_key(const void *a, const void *b)    // key-vs-element comparator: a is a tag name string, b is a missing_rule_t
+{
+    char *key = (char*) a;
+    missing_rule_t *rule = (missing_rule_t*) b;
+    return strcmp(key, rule->hdr_tag);     // same ordering criterion as missing_rules_comp_key2 (strcmp on hdr_tag)
+}
+static void missing_rules_init(args_t *args)   // build the sorted args->missing_rules table from -M, --missing-rules or the gVCF default
+{
+    kstring_t str = {0,0,0};
+    if ( args->missing_rules_str )
+    {
+        if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str);
+        else kputs(args->missing_rules_str,&str);
+    }
+    else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str);
+    else return;    // no rules string and do_gvcf not set: leave the table empty
+
+    args->nmissing_rules = 1;
+    char *ss = str.s, *tmp = ss;
+    int n = 0;      // number of separators seen so far: odd after a ':', even after a ','
+    while ( *ss )   // first pass: split the string in place at ':' and ',' and count the rules
+    {
+        if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        ss++;
+    }
+    if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);
+    args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t));
+
+    n = args->nmissing_rules;
+    args->nmissing_rules = 0;
+    ss = tmp;
+    while ( args->nmissing_rules < n  )     // second pass: walk the NUL-separated TAG,METHOD fields and fill in the rules
+    {
+        missing_rule_t *rule = &args->missing_rules[args->nmissing_rules];
+        rule->hdr_tag = strdup(ss);
+        int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
+        if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) )
+        {
+            if ( args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+            n--;    // a built-in default rule for a tag absent from the header is dropped rather than reported
+            ss = strchr(ss, '\0'); ss++;
+            if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag);
+            free(rule->hdr_tag);    // freed only after the check above, whose message still reads the tag name
+            ss = strchr(ss, '\0'); ss++;
+            continue;
+        }
+
+        ss = strchr(ss, '\0'); ss++;    // advance from the TAG field to its METHOD field
+        if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag);
+
+        if ( !strcasecmp(ss,".") ) rule->type = MERGE_MISSING_DOT;
+        else if ( !strcasecmp(ss,"max") ) rule->type = MERGE_MISSING_MAX;
+        else
+        {
+            char *tmp = ss;
+            rule->value = strtod(ss, &tmp);
+            if ( *tmp ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);    // trailing characters: not a plain number
+            rule->type = MERGE_MISSING_CONST;
+        }
+        ss = strchr(ss, '\0'); ss++;    // advance to the next TAG field
+        args->nmissing_rules++;
+    }
+    qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2);   // sorted by tag; merge_format looks rules up with bsearch
+    free(str.s);
+}
+static void missing_rules_destroy(args_t *args)    // release the rules table built by missing_rules_init
+{
+    int i;
+    for (i=0; i<args->nmissing_rules; i++)
+    {
+        missing_rule_t *rule = &args->missing_rules[i];
+        free(rule->hdr_tag);   // tag names were duplicated with strdup
+    }
+    free(args->missing_rules);
+}
+
  static int info_rules_comp_key2(const void *a, const void *b)
  {
      info_rule_t *rule1 = (info_rule_t*) a;
@@ -772,6 +881,7 @@ void maux_destroy(maux_t *ma)
      int i,j;
      for (i=0; i<ma->nout_smpl; i++) free(ma->str[i].s);
      free(ma->str);
+    free(ma->als_types);
      for (i=0; i<ma->mals; i++)
      {
          free(ma->als[i]);
@@ -795,6 +905,7 @@ void maux_destroy(maux_t *ma)
      free(ma->AGR_info);
      if (ma->ntmp_arr) free(ma->tmp_arr);
      if (ma->nfmt_map) free(ma->fmt_map);
+    free(ma->fmt_key);
      // ma->inf freed in bcf_destroy1
      for (i=0; i<ma->mals; i++) free(ma->als[i]);
      if (ma->mout_als) free(ma->out_als);
@@ -822,7 +933,6 @@ void maux_reset(maux_t *ma, int *rid_tab)
  {
      int i,j;
      for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
-    for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
      for (i=0; i<ma->mals; i++)
      {
          free(ma->als[i]);
@@ -858,6 +968,7 @@ void maux_reset(maux_t *ma, int *rid_tab)
          for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
          {
              ma->buf[i].rec[j].skip = 0;
+            ma->buf[i].rec[j].var_types = 0;
              bcf1_t *line = ma->files->readers[i].buffer[j];
              if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
          }
@@ -961,12 +1072,14 @@ void merge_chrom2qual(args_t *args, bcf1_t *out)
          int ir, j;
          for (ir=0; ir<files->nreaders; ir++)
          {
+            ma->buf[ir].unkn_allele = 0;
              bcf1_t *line = maux_get_line(args,ir);
              if ( !line ) continue;
              for (j=1; j<line->n_allele; j++)
              {
                  int irec = ma->buf[ir].cur;
                  if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+                if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j;
              }
          }
      }
@@ -1987,7 +2100,7 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc
          bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
      ma->laa_dirty = 1;
  }
-void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out)
  {
      bcf_srs_t *files = args->files;
      bcf_hdr_t *out_hdr = args->out_hdr;
@@ -2137,12 +2250,32 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                          for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
                          continue; \
                      } \
-                    int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
-                    for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+                    int haploid = ma->smpl_ploidy[ismpl+j]==1 ? 1 : 0; \
+                    int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+                    if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
+                    {  \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
+                        for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+                    { \
+                        for (l=0; l<ngsize; l++) { *tgt = mrule->value; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+                    { \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src_type_t max = src[0]; \
+                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
+                    } \
+                    else \
+                    { \
+                        for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+                    } \
                      for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
-                    if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+                    if ( haploid ) \
                      { \
-                        /* Haploid */ \
                          int iori, inew; \
                          for (iori=0; iori<line->n_allele; iori++) \
                          { \
@@ -2196,7 +2329,26 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
                          continue; \
                      } \
                      src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
-                    for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+                    if ( ma->buf[i].unkn_allele )  /* Use value from the unknown allele when available */ \
+                    { \
+                        int iunkn = ma->buf[i].unkn_allele; \
+                        for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+                    { \
+                        for (l=0; l<nsize; l++) { *tgt = mrule->value; tgt++; } \
+                    } \
+                    else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+                    { \
+                        src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                        src_type_t max = src[0]; \
+                        for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+                        for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
+                    } \
+                    else \
+                    { \
+                        for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+                    } \
                      int iori,inew; \
                      for (iori=ifrom; iori<line->n_allele; iori++) \
                      { \
@@ -2236,6 +2388,7 @@ void merge_format(args_t *args, bcf1_t *out)
      {
          ma->nfmt_map = 2;
          ma->fmt_map  = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+        ma->fmt_key  = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key));
      }
      else
          memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
@@ -2252,7 +2405,7 @@ void merge_format(args_t *args, bcf1_t *out)
          bcf_hdr_t *hdr = reader->header;
          for (j=0; j<line->n_fmt; j++)
          {
-            // Wat this tag already seen?
+            // Was this tag already seen?
              bcf_fmt_t *fmt = &line->d.fmt[j];
              const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
              kitr = kh_get(strdict, tmph, key);
@@ -2271,9 +2424,11 @@ void merge_format(args_t *args, bcf1_t *out)
                      {
                          ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
                          memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+                        ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1));
                          ma->nfmt_map = max_ifmt+1;
                      }
                      if ( key[0]=='P' && key[1]=='L' && key[2]==0  ) { has_PL = ifmt; }
+                    ma->fmt_key[max_ifmt] = key;
                  }
                  kitr = kh_put(strdict, tmph, key, &ret);
                  kh_value(tmph, kitr) = ifmt;
@@ -2300,7 +2455,10 @@ void merge_format(args_t *args, bcf1_t *out)
          update_AN_AC(out_hdr, out);
  
      for (i=1; i<=max_ifmt; i++)
-        merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+    {
+        missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key);
+        merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out);
+    }
  
      if ( ma->laa_dirty )
          update_local_alleles(args, out);
@@ -2408,6 +2566,9 @@ void gvcf_write_block(args_t *args, int start, int end)
      {
          int slen  = 0;
          char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+        if (!seq)
+            bcftools_exit(1); // faidx_fetch_seq has already reported the error.
+
          if (slen)
          {
              out->d.allele[0][0] = seq[0];
@@ -2522,16 +2683,6 @@ static inline int is_gvcf_block(bcf1_t *line)
      return 0;
  }
  
-// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
-// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
-// to accommodate for VCF_GVCF_REF defined below
-static const int
-    snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
-    indel_mask = VCF_INDEL<<2,
-    ins_mask = VCF_INS<<2,
-    del_mask = VCF_DEL<<2,
-    ref_mask = 2;
-
  /*
      Check incoming lines for new gVCF blocks, set pointer to the current source
      buffer (gvcf or readers).  In contrast to gvcf_flush, this function can be
@@ -2631,7 +2782,7 @@ void clean_buffer(args_t *args)
          {
              if ( ma->gvcf[ir].active )
              {
-                if ( ma->pos >= ma->gvcf[ir].end )  ma->gvcf[ir].active = 0;
+                if ( ma->pos > ma->gvcf[ir].end )  ma->gvcf[ir].active = 0;
                  else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg;  // re-activate interrupted gVCF block
              }
              if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
@@ -2666,13 +2817,16 @@ void debug_maux(args_t *args)
      {
          bcf_sr_t *reader = &files->readers[j];
          buffer_t *buf = &maux->buf[j];
-        fprintf(bcftools_stderr," reader %d: ", j);
+        fprintf(bcftools_stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end);
          for (k=buf->beg; k<buf->end; k++)
          {
-            if ( buf->rec[k].skip & SKIP_DONE ) continue;
-            bcf1_t *line = reader->buffer[k];
+            if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(bcftools_stderr," DONE"); continue; }
+            bcf1_t *line = reader->buffer[k];               // selected for merging by can_merge
              fprintf(bcftools_stderr,"\t");
-            if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"[");  // this record will not be merged in this round
+            if ( buf->cur==k ) fprintf(bcftools_stderr,"!");         // selected for merging by stage_line
+            if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"[");    // this record cannot be merged in this round
+            if ( !line->n_allele && maux->gvcf[j].active )
+                fprintf(bcftools_stderr,"<*>");
              for (l=0; l<line->n_allele; l++)
                  fprintf(bcftools_stderr,"%s%s", l==0?"":",", line->d.allele[l]);
              if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"]");
@@ -2688,9 +2842,10 @@ void debug_state(args_t *args)
  {
      maux_t *maux = args->maux;
      int i,j;
+    fprintf(bcftools_stderr,"State after position=%d done:\n",maux->pos+1);
      for (i=0; i<args->files->nreaders; i++)
      {
-        fprintf(bcftools_stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+        fprintf(bcftools_stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
          if ( maux->buf[i].cur >=0 )
          {
              bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
@@ -2700,20 +2855,136 @@ void debug_state(args_t *args)
          }
          fprintf(bcftools_stderr,"\n");
      }
-    fprintf(bcftools_stderr,"gvcf_min=%d\n", args->maux->gvcf_min);
+    fprintf(bcftools_stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min);
      for (i=0; i<args->files->nreaders; i++)
      {
-        fprintf(bcftools_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+        fprintf(bcftools_stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
          if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1);
          fprintf(bcftools_stderr,"\n");
      }
      fprintf(bcftools_stderr,"\n");
  }
  
+
+// Lines can come with any combination of variant types. We use a subset of the types defined in vcf.h
+// but shifted left by one bit so that bit 0 (ref_mask) can stand for VCF_REF, which is defined as 0 (design flaw in vcf.h, my fault)
+static const int
+    snp_mask   = (VCF_SNP<<1)|(VCF_MNP<<1),    // SNPs and MNPs share one mask
+    indel_mask = (VCF_INDEL<<1),
+    ins_mask   = VCF_INS<<1,
+    del_mask   = VCF_DEL<<1,
+    ref_mask   = 1;                            // bit 0: REF; can_merge() also sets it for active gVCF blocks
+
+// Can the record buf->lines[irec] be merged with the types selected so far (selected_types, a union of *_mask bits) given the -m settings?
+// Returns 1 if yes, 0 if no. Despite the function's name, its focus is on excluding incompatible records; there will be a finer matching later in stage_line()
+static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+{
+    int k;
+    maux_t *maux = args->maux;
+    bcf1_t *rec = buf->lines[irec];
+    int rec_types = buf->rec[irec].var_types;
+
+    assert( selected_types );   // trivially true: can_merge() calls this function only when selected_types is non-zero
+
+    if ( args->collapse & COLLAPSE_ANY ) return 1;  // can merge anything with anything
+
+    // REF and gVCF_REF with no other alleles present can be merged with anything
+    if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+    if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
+
+    if ( args->collapse!=COLLAPSE_NONE )
+    {
+        // If we are here, one of the following modes must have been set: both,snps,indels,snp-ins-del
+        // Include the new record if
+        //  - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del
+        //  - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
+        if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
+        {
+            if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+        }
+        if ( args->collapse&COLLAPSE_INDELS )
+        {
+            if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+        }
+        if ( args->collapse&COLLAPSE_SNP_INS_DEL )
+        {
+            if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
+            if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+        }
+        // Whatever is left, allow to match if the alleles match exactly
+    }
+
+    // The -m none mode or exact matching requested
+    // Simple test first: are the variants of the same type?
+    int x = selected_types >> 1;        // remove REF
+    int y = rec_types >> 1;             // remove REF
+    while ( x && y ) { x>>=1; y>>=1; }  // shift both until one runs out of bits; both reach zero together only if their highest set bits coincide
+    if ( x || y ) return 0;             // the types differ (only the highest set type bit of each side is compared)
+
+    if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0;   // refs are not compatible
+    for (k=1; k<rec->n_allele; k++)
+    {
+        if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue;    // this must be gVCF_REF (<*> or <NON_REF>)
+        if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;    // stop at the first ALT that is already among the selected alleles
+    }
+    if ( k==rec->n_allele ) return 0;   // none of rec's ALT alleles (gVCF_REF aside) is among the alleles selected thus far
+    return 1;   // at least one ALT of rec is already selected. NOTE(review): the loop stops at the first match, it does not verify that all alleles match
+}
+
+static void maux_update_alleles(args_t *args, int ireader, int irec)   // add the alleles of record irec from reader ireader to maux->als and update the per-allele counts (cnt) and types (als_types)
+{
+    int k;
+    bcf_sr_t *reader = &args->files->readers[ireader];     // used only for the file name in the error message below
+    maux_t *maux = args->maux;
+    buffer_t *buf = &maux->buf[ireader];
+    maux1_t *ma1 = &buf->rec[irec];
+    bcf1_t *line = buf->lines[irec];
+    hts_expand(int, line->n_allele, ma1->mmap, ma1->map);  // map[k] holds the index of the record's k-th allele in maux->als
+    if ( !maux->nals )  // first record to be merged, copy the alleles to the output
+    {
+        maux->nals = line->n_allele;
+        hts_expand0(char*, maux->nals, maux->mals, maux->als);
+        hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+        hts_expand0(int, maux->nals, maux->mals_types, maux->als_types);
+        for (k=0; k<maux->nals; k++)
+        {
+            free(maux->als[k]);     // release whatever string the slot held before
+            maux->als[k] = strdup(line->d.allele[k]);
+            ma1->map[k]  = k;       // identity mapping: the output alleles are a copy of this record's alleles
+            maux->cnt[k] = 1;
+            int var_type = bcf_has_variant_type(line, k, VCF_ANY);
+            if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL;    // strip the generic indel flag. NOTE(review): -m snp-ins-del sets COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS in main_vcfmerge, so this equality may never hold; confirm whether a bit test was intended
+            maux->als_types[k] = var_type ? var_type<<1 : ref_mask;    // same encoding as the *_mask constants: shifted by one bit, bit 0 means REF
+        }
+        return;
+    }
+    // normalize alleles; merge_alleles() is expected to fill ma1->map and may grow maux->als and maux->nals
+    maux->als = merge_alleles(line->d.allele, line->n_allele, ma1->map, maux->als, &maux->nals, &maux->mals);
+    if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
+    hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+    hts_expand0(int, maux->nals, maux->mals_types, maux->als_types);
+    for (k=1; k<line->n_allele; k++)   // ALT alleles only, REF is counted after the loop
+    {
+        int ik = ma1->map[k];           // index of this allele in the merged allele list
+        int var_type = bcf_has_variant_type(line, k, VCF_ANY);
+        if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL;    // see the NOTE(review) on the identical test above
+        maux->als_types[ik] = var_type ? var_type<<1 : ref_mask;
+        maux->cnt[ik]++;    // how many times an allele appears in the files
+    }
+    maux->cnt[0]++;     // REF is present in every merged record
+}
+
  /*
-   Determine which line should be merged from which reader: go through all
-   readers and all buffered lines, expand REF,ALT and try to match lines with
-   the same ALTs.
+   Determine which lines remain to be merged across readers at the current position and
+   are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible,
+   SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time.
+
+   At the same time count how many times each allele is present across the readers and records
+   so that we can prioritize the records with the same alleles to come first. In the end at most
+   one record at a time can be selected from each reader, and that will be done in stage_line().
+
+   The function maux_reset already initialized structures for this position, so here each
+   reader comes with the beg,end indexes that point to records with the same maux_t.pos position.
   */
  int can_merge(args_t *args)
  {
@@ -2721,28 +2992,39 @@ int can_merge(args_t *args)
      maux_t *maux = args->maux;
      gvcf_aux_t *gaux = maux->gvcf;
      char *id = NULL, ref = 'N';
-    int i,j,k, ntodo = 0;
+    int i,j, ntodo = 0;
  
      for (i=0; i<maux->nals; i++)
      {
          free(maux->als[i]);
          maux->als[i] = NULL;
+        maux->cnt[i] = 0;
      }
      maux->var_types = maux->nals = 0;
  
-    // this is only for the `-m none -g` mode, ensure that <*> lines come last
-    #define VCF_GVCF_REF 1
-
+    // In this loop we do the following:
+    //  - remember the first encountered ID if matching by ID
+    //  - count the number of unprocessed records at this position
+    //  - collect all variant types at this position. This is to be able to perform -m matching and
+    //    print SNVs first, then indels, then gVCF blocks
+    //  - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet
      for (i=0; i<files->nreaders; i++)
      {
          buffer_t *buf = &maux->buf[i];
+        buf->var_types = 0;
  
-        if ( gaux && gaux[i].active )
+        if ( gaux && gaux[i].active ) // active gvcf block
          {
-            // skip readers with active gvcf blocks
              buf->rec[buf->beg].skip = SKIP_DIFF;
+            maux->var_types |= ref_mask;
+            buf->var_types |= ref_mask;
+            buf->rec[buf->beg].var_types = ref_mask;
              continue;
          }
+
+        // for gvcf: find out REF at this position
+        if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0];
+
          for (j=buf->beg; j<buf->end; j++)
          {
              if ( buf->rec[j].skip & SKIP_DONE ) continue;
@@ -2751,118 +3033,70 @@ int can_merge(args_t *args)
              ntodo++;
  
              bcf1_t *line = buf->lines[j];
-            if ( args->merge_by_id )
-                id = line->d.id;
-            else
+            if ( args->merge_by_id && !id ) { id = line->d.id; continue; }      // set ID when merging by id
+
+            if ( !buf->rec[j].var_types )
              {
                  int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
-                if (var_type < 0) error("bcf_has_variant_types() failed.");
+                if ( var_type < 0 ) error("bcf_has_variant_types() failed.");
                  if ( args->collapse==COLLAPSE_SNP_INS_DEL )
                  {
                      // need to distinguish between ins and del so strip the VCF_INDEL flag
                      var_type &= ~VCF_INDEL;
                  }
-                maux->var_types |= var_type ? var_type<<2 : 2;
-
-                // for the `-m none -g` mode
-                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
-                    maux->var_types |= VCF_GVCF_REF;
+                var_type = var_type ? var_type<<1 : ref_mask;
+                if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+                buf->rec[j].var_types = var_type;
              }
+            maux->var_types |= buf->rec[j].var_types;
+            buf->var_types |= buf->rec[j].var_types;
          }
-
-        // for gvcf: find out REF at this position
-        if ( buf->beg < buf->end && ref=='N' )
-            ref = buf->lines[buf->beg]->d.allele[0][0];
      }
      if ( !ntodo ) return 0;
  
+    int selected_types = 0;
+
      // In this loop we select from each reader compatible candidate lines.
      // (i.e. SNPs or indels). Go through all files and all lines at this
      // position and normalize relevant alleles.
      // REF-only sites may be associated with both SNPs and indels.
      for (i=0; i<files->nreaders; i++)
      {
-        bcf_sr_t *reader = &files->readers[i];
          buffer_t *buf = &maux->buf[i];
-
          if ( gaux && gaux[i].active )
          {
+            // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position
              gaux[i].line->d.allele[0][0] = ref;
              gaux[i].line->pos = maux->pos;
+            maux_update_alleles(args, i, buf->beg);
+            selected_types |= ref_mask;
+            continue;
          }
-
          for (j=buf->beg; j<buf->end; j++)
          {
              if ( buf->rec[j].skip & SKIP_DONE ) continue;
  
              bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
-
-            int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
-            if (line_type < 0) error("bcf_has_variant_types() failed.");
-            line_type = line_type ? line_type<<2 : 2;
+            int line_types = buf->rec[j].var_types;
  
              // select relevant lines
              if ( args->merge_by_id )
              {
-                if ( strcmp(id,line->d.id) ) continue;
+                if ( strcmp(id,line->d.id) ) continue;      // matching by ID and it does not match the selected record
              }
+            else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
              else
              {
-                // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
-                // records come last, otherwise infinite loop is created (#1164)
-                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
-                {
-                    if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
-                }
-                if ( args->collapse==COLLAPSE_NONE && maux->nals )
-                {
-                    // All alleles of the tested record must be present in the
-                    // selected maux record plus variant types must be the same
-                    if ( (maux->var_types & line_type) != line_type ) continue;
-                    if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue;   // refs not compatible
-                    for (k=1; k<line->n_allele; k++)
-                    {
-                        if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
-                    }
-                    if ( !(line_type&ref_mask) && k==line->n_allele ) continue;  // not a REF-only site and there is no matching allele
-                }
-                if ( !(args->collapse&COLLAPSE_ANY) )
-                {
-                    // Merge:
-                    //  - SNPs+SNPs+MNPs+REF if -m both,snps
-                    //  - indels+indels+REF  if -m both,indels, REF only if SNPs are not present
-                    //  - SNPs come first
-                    if ( line_type & (indel_mask|ins_mask|del_mask) )
-                    {
-                        if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue;  // SNPs come first
-                        if ( args->do_gvcf && maux->var_types&ref_mask ) continue;  // never merge indels with gVCF blocks
-                    }
-                }
+                // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+                if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE)     // asked to merge SNVs into multiallelics
+                        && (maux->var_types&snp_mask)                   // there are SNVs at the current position
+                        && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+                   ) continue;
              }
-            buf->rec[j].skip = 0;
+            selected_types |= line_types;
  
-            hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
-            if ( !maux->nals )    // first record, copy the alleles to the output
-            {
-                maux->nals = line->n_allele;
-                hts_expand0(char*, maux->nals, maux->mals, maux->als);
-                hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
-                for (k=0; k<maux->nals; k++)
-                {
-                    free(maux->als[k]);
-                    maux->als[k] = strdup(line->d.allele[k]);
-                    buf->rec[j].map[k] = k;
-                    maux->cnt[k] = 1;
-                }
-                continue;
-            }
-            // normalize alleles
-            maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
-            if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
-            hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
-            for (k=1; k<line->n_allele; k++)
-                maux->cnt[ buf->rec[j].map[k] ]++;    // how many times an allele appears in the files
-            maux->cnt[0]++;
+            buf->rec[j].skip = 0;   // the j-th record from i-th reader can be included. Final decision will be made in stage_line
+            maux_update_alleles(args, i, j);
          }
      }
      return 1;
@@ -2880,48 +3114,61 @@ void stage_line(args_t *args)
      bcf_srs_t *files = args->files;
      maux_t *maux = args->maux;
  
-    // debug_maux(args);
-
-    // take the most frequent allele present in multiple files, REF is skipped
-    int i,j,k,icnt = 1;
-    for (i=2; i<maux->nals; i++)
-        if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+    // Take the most frequent allele present in multiple files, REF and gVCF_REF is skipped.
+    int i,j,k,icnt = -1;
+    for (i=1; i<maux->nals; i++)
+    {
+        if ( maux->als_types[i] & ref_mask ) continue;
+        if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i;
+    }
+    int selected_type = icnt>0 ? maux->als_types[icnt] : ref_mask;
  
      int nout = 0;
      for (i=0; i<files->nreaders; i++)
      {
          buffer_t *buf = &maux->buf[i];
          buf->cur = -1;
-        if ( buf->beg >= buf->end ) continue;   // no lines in the buffer
+        if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site
  
          // find lines with the same allele
          for (j=buf->beg; j<buf->end; j++)
          {
-            if ( buf->rec[j].skip ) continue;   // done or not compatible
-            if ( args->merge_by_id ) break;
-            if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break;   // REF-only record
+            if ( buf->rec[j].skip )
+            {
+                int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
+                if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+                if ( !is_gvcf ) continue;   // done or not compatible
+            }
+            if ( args->merge_by_id ) break;     // if merging by ID and the line is compatible, then this is THE line
+
+            // skip if the reader has a record that matches the most frequent allele and this record is not it
+            if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue;
  
+            // if the reader does not have the most frequent allele type but is a ref, accept
+            if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break;
+            if ( selected_type==ref_mask ) break;
+
+            // accept if the record has the most frequent allele
              for (k=0; k<buf->lines[j]->n_allele; k++)
                  if ( icnt==buf->rec[j].map[k] ) break;
-
              if ( k<buf->lines[j]->n_allele ) break;
          }
          if ( j>=buf->end )
          {
              // no matching allele found in this file
-            if ( args->collapse==COLLAPSE_NONE ) continue;
+            if ( args->collapse==COLLAPSE_NONE ) continue;  // exact matching requested, skip
  
+            // choose something compatible to create a multiallelic site given the -m criteria
              for (j=buf->beg; j<buf->end; j++)
              {
                  if ( buf->rec[j].skip ) continue;   // done or not compatible
                  if ( args->collapse&COLLAPSE_ANY ) break;   // anything can be merged
-                int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
-                if (line_type < 0) error("bcf_has_variant_types() failed.");
-                if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
-                if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
-                if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
-                if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
-                if ( line_type==VCF_REF )
+                int line_type = buf->rec[j].var_types;
+                if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+                if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+                if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+                if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+                if ( line_type&ref_mask )
                  {
                      if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
                      if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
@@ -2942,12 +3189,21 @@ void stage_line(args_t *args)
          {
              // found a suitable line for merging
              buf->cur = j;
-
-            // mark as finished so that it's ignored next time
-            buf->rec[j].skip  = SKIP_DONE;
-            nout++;
          }
      }
+
+    // debug_maux(args);
+
+    // Mark lines staged for merging as finished so that they are ignored next time
+    for (i=0; i<files->nreaders; i++)
+    {
+        buffer_t *buf = &maux->buf[i];
+        if ( buf->cur == -1 ) continue;
+
+        buf->rec[buf->cur].skip  = SKIP_DONE;
+        nout++;
+    }
+
      assert( nout );
  }
  
@@ -3080,6 +3336,7 @@ void merge_vcf(args_t *args)
              error_errno("[%s] Failed to update header", __func__);
      }
      info_rules_init(args);
+    missing_rules_init(args);
  
      bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
      if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
@@ -3089,6 +3346,7 @@ void merge_vcf(args_t *args)
          if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
          return;
      }
+    else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
      args->maux = maux_init(args);
@@ -3124,9 +3382,19 @@ void merge_vcf(args_t *args)
          gvcf_flush(args,1);
  
      info_rules_destroy(args);
+    missing_rules_destroy(args);
      maux_destroy(args->maux);
      bcf_hdr_destroy(args->out_hdr);
-    if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out_fh)<0 )
+        {
+            if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
+    if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"bcftools_stdout");
      bcf_destroy1(args->out_line);
      kh_destroy(strdict, args->tmph);
      if ( args->tmps.m ) free(args->tmps.s);
@@ -3148,11 +3416,12 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -0  --missing-to-ref              Assume genotypes at missing sites are 0/0\n");
      fprintf(bcftools_stderr, "    -f, --apply-filters LIST          Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
      fprintf(bcftools_stderr, "    -F, --filter-logic x|+            Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
-    fprintf(bcftools_stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+    fprintf(bcftools_stderr, "    -g, --gvcf -|REF.FA               Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
      fprintf(bcftools_stderr, "    -i, --info-rules TAG:METHOD,..    Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
      fprintf(bcftools_stderr, "    -l, --file-list FILE              Read file names from the file\n");
      fprintf(bcftools_stderr, "    -L, --local-alleles INT           EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
      fprintf(bcftools_stderr, "    -m, --merge STRING                Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+    fprintf(bcftools_stderr, "    -M, --missing-rules TAG:METHOD    Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
      fprintf(bcftools_stderr, "        --no-index                    Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
      fprintf(bcftools_stderr, "        --no-version                  Do not append version and command line to the header\n");
      fprintf(bcftools_stderr, "    -o, --output FILE                 Write output to a file [standard output]\n");
@@ -3161,6 +3430,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -R, --regions-file FILE           Restrict to regions listed in a file\n");
      fprintf(bcftools_stderr, "        --regions-overlap 0|1|2       Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
      fprintf(bcftools_stderr, "        --threads INT                 Use multithreading with <int> worker threads [0]\n");
+    fprintf(bcftools_stderr, "        --write-index                 Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      bcftools_exit(1);
  }
@@ -3199,13 +3469,15 @@ int main_vcfmerge(int argc, char *argv[])
          {"regions-file",required_argument,NULL,'R'},
          {"regions-overlap",required_argument,NULL,4},
          {"info-rules",required_argument,NULL,'i'},
+        {"missing-rules",required_argument,NULL,'M'},
          {"no-version",no_argument,NULL,8},
          {"no-index",no_argument,NULL,10},
          {"filter-logic",required_argument,NULL,'F'},
+        {"write-index",no_argument,NULL,11},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
          switch (c) {
              case 'L':
                  args->local_alleles = strtol(optarg,&tmp,10);
@@ -3229,6 +3501,7 @@ int main_vcfmerge(int argc, char *argv[])
                  break;
              case 'l': args->file_list = optarg; break;
              case 'i': args->info_rules = optarg; break;
+            case 'M': args->missing_rules_str = optarg; break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
                  switch (optarg[0]) {
@@ -3256,7 +3529,7 @@ int main_vcfmerge(int argc, char *argv[])
                  else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
                  else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
                  else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
-                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
+                else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
                  else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
                  else error("The -m type \"%s\" is not recognised.\n", optarg);
                  break;
@@ -3273,6 +3546,7 @@ int main_vcfmerge(int argc, char *argv[])
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
              case 10 : args->no_index = 1; break;
+            case 11 : args->write_index = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c

index 9538f8d01a723d8c49d536d2d7137743b6f2ebfe..02ad322d17d496e536ea8ab3718144d2d52ba8cd 100644 (file)
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -1,6 +1,6 @@
  /*  vcfnorm.c -- Left-align and normalize indels.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -40,6 +40,8 @@ THE SOFTWARE.  */
  #include "bcftools.h"
  #include "rbuf.h"
  #include "abuf.h"
+#include "gff.h"
+#include "regidx.h"
  
  #define CHECK_REF_EXIT 1
  #define CHECK_REF_WARN 2
@@ -86,8 +88,8 @@ typedef struct
      int32_t *int32_arr;
      int ntmp_arr1, ntmp_arr2, nint32_arr;
      kstring_t *tmp_str;
-    kstring_t *tmp_als, tmp_kstr;
-    int ntmp_als;
+    kstring_t *tmp_als, *tmp_del, tmp_kstr;
+    int ntmp_als, ntmp_del;
      rbuf_t rbuf;
      int buf_win;            // maximum distance between two records to consider
      int aln_win;            // the realignment window size (maximum repeat size)
@@ -105,6 +107,13 @@ typedef struct
      int use_star_allele, ma_use_ref_allele;
      char *old_rec_tag;
      htsFile *out;
+    char *index_fn;
+    int write_index;
+    int right_align;
+    char *gff_fname;
+    gff_t *gff;
+    regidx_t *idx_tscript;
+    regitr_t *itr_tscript;
  }
  args_t;
  
@@ -344,6 +353,157 @@ static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
              error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
  }
  
+static int is_left_align(args_t *args, bcf1_t *line)
+{
+    if ( args->right_align ) return 0;
+    if ( !args->gff ) return 1;
+    const char *chr = bcf_seqname(args->hdr,line);
+    if ( !strncasecmp("chr",chr,3) ) chr += 3;  // strip 'chr' prefix, that's what we requested the GFF reader to do
+    if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;
+
+    // if there are two conflicting overlapping transcripts, go with the default left-alignment
+    int has_fwd = 0;
+    while ( regitr_overlap(args->itr_tscript) )
+    {
+        gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*);
+        if ( tr->strand==STRAND_FWD ) has_fwd = 1;
+        if ( tr->strand==STRAND_REV ) return 1;
+    }
+    // either no hit at all (then left-align) or everything was on fwd strand (then right-align)
+    return has_fwd ? 0 : 1;
+}
+static hts_pos_t realign_left(args_t *args, bcf1_t *line)
+{
+    // trim from right
+    char *ref = NULL;
+    int i;
+    hts_pos_t nref=0, new_pos = line->pos;
+    kstring_t *als = args->tmp_als;
+    while (1)
+    {
+        // is the rightmost base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( als[i].l < min_len ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+        if ( min_len<=1 && new_pos==0 ) break;
+
+        int pad_from_left = 0;
+        for (i=0; i<line->n_allele; i++) // trim all alleles
+        {
+            als[i].l--;
+            if ( !als[i].l ) pad_from_left = 1;
+        }
+        if ( pad_from_left )
+        {
+            // extend all alleles to the left by aln_win bases (unless close to the chr start).
+            // Extra bases will be trimmed from the left after this loop is done
+            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1);
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++)
+            {
+                ks_resize(&als[i], als[i].l + npad);
+                if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
+                memcpy(als[i].s,ref,npad);
+                als[i].l += npad;
+            }
+            new_pos -= npad;
+        }
+    }
+    free(ref);
+
+    // trim from left
+    int ntrim_left = 0;
+    while (1)
+    {
+        // is the first base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
+        ntrim_left++;
+    }
+    if ( ntrim_left )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;
+    }
+    return new_pos;
+}
+
+static hts_pos_t realign_right(args_t *args, bcf1_t *line)
+{
+    char *ref = NULL;
+    int i;
+    hts_pos_t new_pos = line->pos, nref = 0;
+    kstring_t *als = args->tmp_als;
+
+    // trim from left
+    int ntrim_left = 0, npad_right = line->rlen, has_indel = 0;
+    while (1)
+    {
+        // is the leftmost base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( als[0].l!=als[i].l ) has_indel = 1;
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further
+
+        ntrim_left++;
+        if ( min_len<=1 ) // pad from the right
+        {
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left);
+            npad_right += args->aln_win;
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++) kputs(ref, &als[i]);
+        }
+    }
+    ntrim_left -= has_indel;
+    if ( ntrim_left > 0 )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left);
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;
+    }
+    free(ref);
+
+    // trim from right
+    while (1)
+    {
+        // is the last base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( min_len > als[i].l ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more
+        for (i=0; i<line->n_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; }
+    }
+    return new_pos;
+}
+
  #define ERR_DUP_ALLELE       -2
  #define ERR_REF_MISMATCH     -1
  #define ERR_OK                0
@@ -396,10 +556,32 @@ static int realign(args_t *args, bcf1_t *line)
  
      // make a copy of each allele for trimming
      hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
      kstring_t *als = args->tmp_als;
+    kstring_t *del = args->tmp_del;
      for (i=0; i<line->n_allele; i++)
      {
-        if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
+        del[i].l = 0;
+        if ( line->d.allele[i][0]=='<' )
+        {
+            // symbolic allele, only <DEL.*> will be realigned
+            if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
+            if ( nref < line->rlen )
+            {
+                free(ref);
+                reflen = line->rlen;
+                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+                seq_to_upper(ref,0);
+                replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
+                als[0].l = 0;
+                kputs(ref, &als[0]);
+                als[i].l = 0;
+                kputsn(ref,1,&als[i]);
+                kputs(line->d.allele[i],&del[i]);
+                continue;
+            }
+        }
          if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
          if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
          {
@@ -416,69 +598,17 @@ static int realign(args_t *args, bcf1_t *line)
  
          if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
      }
-
-    // trim from right
-    int new_pos = line->pos;
-    while (1)
-    {
-        // is the rightmost base identical in all alleles?
-        int min_len = als[0].l;
-        for (i=1; i<line->n_allele; i++)
-        {
-            if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break;
-            if ( als[i].l < min_len ) min_len = als[i].l;
-        }
-        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
-        if ( min_len<=1 && new_pos==0 ) break;
-
-        int pad_from_left = 0;
-        for (i=0; i<line->n_allele; i++) // trim all alleles
-        {
-            als[i].l--;
-            if ( !als[i].l ) pad_from_left = 1;
-        }
-        if ( pad_from_left )
-        {
-            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
-            free(ref);
-            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
-            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
-            replace_iupac_codes(ref,nref);
-            for (i=0; i<line->n_allele; i++)
-            {
-                ks_resize(&als[i], als[i].l + npad);
-                if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
-                memcpy(als[i].s,ref,npad);
-                als[i].l += npad;
-            }
-            new_pos -= npad;
-        }
-    }
      free(ref);
+    ref = NULL;
  
-    // trim from left
-    int ntrim_left = 0;
-    while (1)
-    {
-        // is the first base identical in all alleles?
-        int min_len = als[0].l - ntrim_left;
-        for (i=1; i<line->n_allele; i++)
-        {
-            if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
-            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
-        }
-        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
-        ntrim_left++;
-    }
-    if ( ntrim_left )
-    {
-        for (i=0; i<line->n_allele; i++)
-        {
-            memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
-            als[i].l -= ntrim_left;
-        }
-        new_pos += ntrim_left;
-    }
+    // which direction are we aligning?
+    int left_align = is_left_align(args, line);
+
+    hts_pos_t new_pos;
+    if ( left_align )
+        new_pos = realign_left(args, line);
+    else
+        new_pos = realign_right(args, line);
  
      // Have the alleles changed?
      als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
@@ -491,7 +621,8 @@ static int realign(args_t *args, bcf1_t *line)
      for (i=0; i<line->n_allele; i++)
      {
          if (i>0) kputc(',',&args->tmp_kstr);
-        kputsn(als[i].s,als[i].l,&args->tmp_kstr);
+        if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+        else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
      }
      args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
      bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
@@ -1281,10 +1412,12 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
          ngts2 /= nsmpl;
          if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
  
-        int32_t *gt  = (int32_t*) args->tmp_arr1;
-        int32_t *gt2 = (int32_t*) args->tmp_arr2;
+        int32_t *gt  = (int32_t*) args->tmp_arr1;       // the first, destination line
+        int32_t *gt2 = (int32_t*) args->tmp_arr2;       // one of the subsequent lines, i.e. the source line
          for (j=0; j<nsmpl; j++)
          {
+            // Take each source allele and apply to the first line. We try to preserve the order and phasing and we
+            // never overwrite with ref allele
              for (k2=0; k2<ngts2; k2++)
              {
                  if ( gt2[k2]==bcf_int32_vector_end ) break;
@@ -1292,12 +1425,18 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
                  int ial2 = bcf_gt_allele(gt2[k2]);
                  if ( ial2==0 ) continue;    // never overwrite with ref
                  if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+
+                // The destination allele
                  int ial = args->maps[i].map[ial2];
-                for (k=0; k<ngts; k++)
-                    if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
-                if ( k<ngts )
+                if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
+                    gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+                else
                  {
-                    gt[k] = bcf_gt_unphased(ial);
+                    // conflict, the first line has non-zero allele, use the old way, possibly disrupt the phasing
+                    for (k=0; k<ngts; k++)
+                        if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+                    if ( k<ngts )
+                        gt[k] = bcf_gt_unphased(ial);
                  }
              }
              gt  += ngts;
@@ -1906,10 +2045,24 @@ static void init_data(args_t *args)
              abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
          abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
      }
+    if ( args->gff_fname )
+    {
+        args->gff = gff_init(args->gff_fname);
+        gff_set(args->gff,verbosity,1);
+        gff_set(args->gff,strip_chr_names,1);
+        gff_parse(args->gff);
+        args->idx_tscript = gff_get(args->gff,idx_tscript);
+        args->itr_tscript = regitr_init(NULL);
+    }
  }
  
  static void destroy_data(args_t *args)
  {
+    if ( args->gff )
+    {
+        gff_destroy(args->gff);
+        regitr_destroy(args->itr_tscript);
+    }
      cmpals_destroy(&args->cmpals_in);
      cmpals_destroy(&args->cmpals_out);
      int i;
@@ -1929,7 +2082,10 @@ static void destroy_data(args_t *args)
          free(args->maps[i].map);
      for (i=0; i<args->ntmp_als; i++)
          free(args->tmp_als[i].s);
+    for (i=0; i<args->ntmp_del; i++)
+        free(args->tmp_del[i].s);
      free(args->tmp_als);
+    free(args->tmp_del);
      free(args->tmp_kstr.s);
      if ( args->tmp_str )
      {
@@ -2018,6 +2174,7 @@ static void normalize_vcf(args_t *args)
          hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
      if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
      if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      bcf1_t *line;
      int prev_rid = -1, prev_pos = -1, prev_type = 0;
@@ -2081,6 +2238,15 @@ static void normalize_vcf(args_t *args)
          if ( j>0 ) flush_buffer(args, args->out, j);
      }
      flush_buffer(args, args->out, args->rbuf.n);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out)<0 )
+        {
+            if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
  
      fprintf(stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
@@ -2104,6 +2270,7 @@ static void usage(void)
      fprintf(stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
      fprintf(stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
      fprintf(stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+    fprintf(stderr, "    -g, --gff-annot FILE            Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
      fprintf(stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
      fprintf(stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
      fprintf(stderr, "        --multi-overlaps 0|.        Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
@@ -2121,6 +2288,7 @@ static void usage(void)
      fprintf(stderr, "        --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
      fprintf(stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
      fprintf(stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
+    fprintf(stderr, "        --write-index               Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Examples:\n");
      fprintf(stderr, "   # normalize and left-align indels\n");
@@ -2163,6 +2331,8 @@ int main_vcfnorm(int argc, char *argv[])
          {"old-rec-tag",required_argument,NULL,12},
          {"keep-sum",required_argument,NULL,10},
          {"fasta-ref",required_argument,NULL,'f'},
+        {"gff-annot",required_argument,NULL,'g'},
+        {"right-align",no_argument,NULL,15},            // undocumented, only for debugging
          {"do-not-normalize",no_argument,NULL,'N'},
          {"multiallelics",required_argument,NULL,'m'},
          {"multi-overlaps",required_argument,NULL,13},
@@ -2181,10 +2351,11 @@ int main_vcfnorm(int argc, char *argv[])
          {"check-ref",required_argument,NULL,'c'},
          {"strict-filter",no_argument,NULL,'s'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,14},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
          switch (c) {
              case  10:
                  // possibly generalize this also to INFO/AD and other tags
@@ -2192,6 +2363,7 @@ int main_vcfnorm(int argc, char *argv[])
                      error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
                  args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
                  break;
+            case 'g': args->gff_fname = optarg; break;
              case 'a': args->atomize = SPLIT; break;
              case 11 :
                  if ( optarg[0]=='*' ) args->use_star_allele = 1;
@@ -2204,6 +2376,8 @@ int main_vcfnorm(int argc, char *argv[])
                  else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
                  else error("Invalid argument to --multi-overlaps\n");
                  break;
+            case 14 : args->write_index = 1; break;
+            case 15 : args->right_align = 1; break;
              case 'N': args->do_indels = 0; break;
              case 'd':
                  if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c

index e2d417748bfd65ce654254dc6d6dccb22f023644..de9c2857b25ca589618779bdd4ea966b721aab0b 100644 (file)
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfnorm.c -- Left-align and normalize indels.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -42,6 +42,8 @@ THE SOFTWARE.  */
  #include "bcftools.h"
  #include "rbuf.h"
  #include "abuf.h"
+#include "gff.h"
+#include "regidx.h"
  
  #define CHECK_REF_EXIT 1
  #define CHECK_REF_WARN 2
@@ -88,8 +90,8 @@ typedef struct
      int32_t *int32_arr;
      int ntmp_arr1, ntmp_arr2, nint32_arr;
      kstring_t *tmp_str;
-    kstring_t *tmp_als, tmp_kstr;
-    int ntmp_als;
+    kstring_t *tmp_als, *tmp_del, tmp_kstr;
+    int ntmp_als, ntmp_del;
      rbuf_t rbuf;
      int buf_win;            // maximum distance between two records to consider
      int aln_win;            // the realignment window size (maximum repeat size)
@@ -107,6 +109,13 @@ typedef struct
      int use_star_allele, ma_use_ref_allele;
      char *old_rec_tag;
      htsFile *out;
+    char *index_fn;
+    int write_index;
+    int right_align;
+    char *gff_fname;
+    gff_t *gff;
+    regidx_t *idx_tscript;
+    regitr_t *itr_tscript;
  }
  args_t;
  
@@ -346,6 +355,157 @@ static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
              error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
  }
  
+static int is_left_align(args_t *args, bcf1_t *line)
+{
+    if ( args->right_align ) return 0;
+    if ( !args->gff ) return 1;
+    const char *chr = bcf_seqname(args->hdr,line);
+    if ( !strncasecmp("chr",chr,3) ) chr += 3;  // strip 'chr' prefix, that's what we requested the GFF reader to do
+    if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;
+
+    // if there are two conflicting overlapping transcripts, go with the default left-alignment
+    int has_fwd = 0;
+    while ( regitr_overlap(args->itr_tscript) )
+    {
+        gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*);
+        if ( tr->strand==STRAND_FWD ) has_fwd = 1;
+        if ( tr->strand==STRAND_REV ) return 1;
+    }
+    // either no hit at all (then left-align) or everything was on fwd strand (then right-align)
+    return has_fwd ? 0 : 1;
+}
+static hts_pos_t realign_left(args_t *args, bcf1_t *line)
+{
+    // trim from right
+    char *ref = NULL;
+    int i;
+    hts_pos_t nref=0, new_pos = line->pos;
+    kstring_t *als = args->tmp_als;
+    while (1)
+    {
+        // is the rightmost base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( als[i].l < min_len ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+        if ( min_len<=1 && new_pos==0 ) break;
+
+        int pad_from_left = 0;
+        for (i=0; i<line->n_allele; i++) // trim all alleles
+        {
+            als[i].l--;
+            if ( !als[i].l ) pad_from_left = 1;
+        }
+        if ( pad_from_left )
+        {
+            // extend all alleles to the left by aln_win bases (unless close to the chr start).
+            // Extra bases will be trimmed from the left after this loop is done
+            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1);
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++)
+            {
+                ks_resize(&als[i], als[i].l + npad);
+                if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
+                memcpy(als[i].s,ref,npad);
+                als[i].l += npad;
+            }
+            new_pos -= npad;
+        }
+    }
+    free(ref);
+
+    // trim from left
+    int ntrim_left = 0;
+    while (1)
+    {
+        // is the first base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
+        ntrim_left++;
+    }
+    if ( ntrim_left )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;
+    }
+    return new_pos;
+}
+
+static hts_pos_t realign_right(args_t *args, bcf1_t *line)
+{
+    char *ref = NULL;
+    int i;
+    hts_pos_t new_pos = line->pos, nref = 0;
+    kstring_t *als = args->tmp_als;
+
+    // trim from left
+    int ntrim_left = 0, npad_right = line->rlen, has_indel = 0;
+    while (1)
+    {
+        // is the leftmost base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( als[0].l!=als[i].l ) has_indel = 1;
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further
+
+        ntrim_left++;
+        if ( min_len<=1 ) // pad from the right
+        {
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left);
+            npad_right += args->aln_win;
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++) kputs(ref, &als[i]);
+        }
+    }
+    ntrim_left -= has_indel;
+    if ( ntrim_left > 0 )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left);
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;
+    }
+    free(ref);
+
+    // trim from right
+    while (1)
+    {
+        // is the last base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( min_len > als[i].l ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more
+        for (i=0; i<line->n_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; }
+    }
+    return new_pos;
+}
+
  #define ERR_DUP_ALLELE       -2
  #define ERR_REF_MISMATCH     -1
  #define ERR_OK                0
@@ -398,10 +558,32 @@ static int realign(args_t *args, bcf1_t *line)
  
      // make a copy of each allele for trimming
      hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+    hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
      kstring_t *als = args->tmp_als;
+    kstring_t *del = args->tmp_del;
      for (i=0; i<line->n_allele; i++)
      {
-        if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
+        del[i].l = 0;
+        if ( line->d.allele[i][0]=='<' )
+        {
+            // symbolic allele, only <DEL.*> will be realigned
+            if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
+            if ( nref < line->rlen )
+            {
+                free(ref);
+                reflen = line->rlen;
+                ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+                if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+                seq_to_upper(ref,0);
+                replace_iupac_codes(ref,nref);  // any non-ACGT character in fasta ref is replaced with N
+                als[0].l = 0;
+                kputs(ref, &als[0]);
+                als[i].l = 0;
+                kputsn(ref,1,&als[i]);
+                kputs(line->d.allele[i],&del[i]);
+                continue;
+            }
+        }
          if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
          if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
          {
@@ -418,69 +600,17 @@ static int realign(args_t *args, bcf1_t *line)
  
          if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
      }
-
-    // trim from right
-    int new_pos = line->pos;
-    while (1)
-    {
-        // is the rightmost base identical in all alleles?
-        int min_len = als[0].l;
-        for (i=1; i<line->n_allele; i++)
-        {
-            if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break;
-            if ( als[i].l < min_len ) min_len = als[i].l;
-        }
-        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
-        if ( min_len<=1 && new_pos==0 ) break;
-
-        int pad_from_left = 0;
-        for (i=0; i<line->n_allele; i++) // trim all alleles
-        {
-            als[i].l--;
-            if ( !als[i].l ) pad_from_left = 1;
-        }
-        if ( pad_from_left )
-        {
-            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
-            free(ref);
-            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
-            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
-            replace_iupac_codes(ref,nref);
-            for (i=0; i<line->n_allele; i++)
-            {
-                ks_resize(&als[i], als[i].l + npad);
-                if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
-                memcpy(als[i].s,ref,npad);
-                als[i].l += npad;
-            }
-            new_pos -= npad;
-        }
-    }
      free(ref);
+    ref = NULL;
  
-    // trim from left
-    int ntrim_left = 0;
-    while (1)
-    {
-        // is the first base identical in all alleles?
-        int min_len = als[0].l - ntrim_left;
-        for (i=1; i<line->n_allele; i++)
-        {
-            if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
-            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
-        }
-        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
-        ntrim_left++;
-    }
-    if ( ntrim_left )
-    {
-        for (i=0; i<line->n_allele; i++)
-        {
-            memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
-            als[i].l -= ntrim_left;
-        }
-        new_pos += ntrim_left;
-    }
+    // which direction are we aligning?
+    int left_align = is_left_align(args, line);
+
+    hts_pos_t new_pos;
+    if ( left_align )
+        new_pos = realign_left(args, line);
+    else
+        new_pos = realign_right(args, line);
  
      // Have the alleles changed?
      als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
@@ -493,7 +623,8 @@ static int realign(args_t *args, bcf1_t *line)
      for (i=0; i<line->n_allele; i++)
      {
          if (i>0) kputc(',',&args->tmp_kstr);
-        kputsn(als[i].s,als[i].l,&args->tmp_kstr);
+        if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+        else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
      }
      args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
      bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
@@ -1283,10 +1414,12 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
          ngts2 /= nsmpl;
          if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
  
-        int32_t *gt  = (int32_t*) args->tmp_arr1;
-        int32_t *gt2 = (int32_t*) args->tmp_arr2;
+        int32_t *gt  = (int32_t*) args->tmp_arr1;       // the first, destination line
+        int32_t *gt2 = (int32_t*) args->tmp_arr2;       // one of the subsequent lines, i.e. the source line
          for (j=0; j<nsmpl; j++)
          {
+            // Take each source allele and apply to the first line. We try to preserve the order and phasing and we
+            // never overwrite with ref allele
              for (k2=0; k2<ngts2; k2++)
              {
                  if ( gt2[k2]==bcf_int32_vector_end ) break;
@@ -1294,12 +1427,18 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
                  int ial2 = bcf_gt_allele(gt2[k2]);
                  if ( ial2==0 ) continue;    // never overwrite with ref
                  if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+
+                // The destination allele
                  int ial = args->maps[i].map[ial2];
-                for (k=0; k<ngts; k++)
-                    if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
-                if ( k<ngts )
+                if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
+                    gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+                else
                  {
-                    gt[k] = bcf_gt_unphased(ial);
+                    // Conflict: the first line already has a non-ref allele in this slot. Fall back to the old way (fill the first free or ref slot, unphased), which may disrupt the phasing
+                    for (k=0; k<ngts; k++)
+                        if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+                    if ( k<ngts )
+                        gt[k] = bcf_gt_unphased(ial);
                  }
              }
              gt  += ngts;
@@ -1908,10 +2047,24 @@ static void init_data(args_t *args)
              abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
          abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
      }
+    if ( args->gff_fname )
+    {
+        args->gff = gff_init(args->gff_fname);
+        gff_set(args->gff,verbosity,1);
+        gff_set(args->gff,strip_chr_names,1);
+        gff_parse(args->gff);
+        args->idx_tscript = gff_get(args->gff,idx_tscript);
+        args->itr_tscript = regitr_init(NULL);
+    }
  }
  
  static void destroy_data(args_t *args)
  {
+    if ( args->gff )
+    {
+        gff_destroy(args->gff);
+        regitr_destroy(args->itr_tscript);
+    }
      cmpals_destroy(&args->cmpals_in);
      cmpals_destroy(&args->cmpals_out);
      int i;
@@ -1931,7 +2084,10 @@ static void destroy_data(args_t *args)
          free(args->maps[i].map);
      for (i=0; i<args->ntmp_als; i++)
          free(args->tmp_als[i].s);
+    for (i=0; i<args->ntmp_del; i++)
+        free(args->tmp_del[i].s);
      free(args->tmp_als);
+    free(args->tmp_del);
      free(args->tmp_kstr.s);
      if ( args->tmp_str )
      {
@@ -2020,6 +2176,7 @@ static void normalize_vcf(args_t *args)
          hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
      if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
      if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
  
      bcf1_t *line;
      int prev_rid = -1, prev_pos = -1, prev_type = 0;
@@ -2083,6 +2240,15 @@ static void normalize_vcf(args_t *args)
          if ( j>0 ) flush_buffer(args, args->out, j);
      }
      flush_buffer(args, args->out, args->rbuf.n);
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(args->out)<0 )
+        {
+            if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
  
      fprintf(bcftools_stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
@@ -2106,6 +2272,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
      fprintf(bcftools_stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
      fprintf(bcftools_stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+    fprintf(bcftools_stderr, "    -g, --gff-annot FILE            Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
      fprintf(bcftools_stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
      fprintf(bcftools_stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
      fprintf(bcftools_stderr, "        --multi-overlaps 0|.        Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
@@ -2123,6 +2290,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "        --targets-overlap 0|1|2     Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
      fprintf(bcftools_stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
      fprintf(bcftools_stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
+    fprintf(bcftools_stderr, "        --write-index               Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Examples:\n");
      fprintf(bcftools_stderr, "   # normalize and left-align indels\n");
@@ -2165,6 +2333,8 @@ int main_vcfnorm(int argc, char *argv[])
          {"old-rec-tag",required_argument,NULL,12},
          {"keep-sum",required_argument,NULL,10},
          {"fasta-ref",required_argument,NULL,'f'},
+        {"gff-annot",required_argument,NULL,'g'},
+        {"right-align",no_argument,NULL,15},            // undocumented, only for debugging
          {"do-not-normalize",no_argument,NULL,'N'},
          {"multiallelics",required_argument,NULL,'m'},
          {"multi-overlaps",required_argument,NULL,13},
@@ -2183,10 +2353,11 @@ int main_vcfnorm(int argc, char *argv[])
          {"check-ref",required_argument,NULL,'c'},
          {"strict-filter",no_argument,NULL,'s'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,14},
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
          switch (c) {
              case  10:
                  // possibly generalize this also to INFO/AD and other tags
@@ -2194,6 +2365,7 @@ int main_vcfnorm(int argc, char *argv[])
                      error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
                  args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
                  break;
+            case 'g': args->gff_fname = optarg; break;
              case 'a': args->atomize = SPLIT; break;
              case 11 :
                  if ( optarg[0]=='*' ) args->use_star_allele = 1;
@@ -2206,6 +2378,8 @@ int main_vcfnorm(int argc, char *argv[])
                  else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
                  else error("Invalid argument to --multi-overlaps\n");
                  break;
+            case 14 : args->write_index = 1; break;
+            case 15 : args->right_align = 1; break;
              case 'N': args->do_indels = 0; break;
              case 'd':
                  if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c

index 45686680a2872c76dfb075762adee1b7ce5a5e5f..68775196116047e24e5082bb6cd6995d0e839a7b 100644 (file)
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -1,6 +1,6 @@
  /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -149,6 +149,8 @@ typedef struct _args_t
  
      char **argv, *output_fname, *regions_list, *targets_list;
      int argc, drop_header, verbose, record_cmd_line, plist_only;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -548,6 +550,7 @@ static void init_data(args_t *args)
          if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
          if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
          if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      }
  }
  
@@ -569,7 +572,19 @@ static void destroy_data(args_t *args)
      }
      if ( args->filter )
          filter_destroy(args->filter);
-    if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if (args->out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    }
  }
  
  static void usage(args_t *args)
@@ -598,6 +613,7 @@ static void usage(args_t *args)
      fprintf(stderr, "   -l, --list-plugins             List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
      fprintf(stderr, "   -v, --verbose                  Print verbose information, -vv increases verbosity\n");
      fprintf(stderr, "   -V, --version                  Print version string and exit\n");
+    fprintf(stderr, "       --write-index              Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -643,9 +659,9 @@ int main_plugin(int argc, char *argv[])
      if ( argv[1][0]!='-' )
      {
          args->verbose = is_verbose(argc, argv);
-        plugin_name = argv[1]; 
-        argc--; 
-        argv++; 
+        plugin_name = argv[1];
+        argc--;
+        argv++;
          load_plugin(args, plugin_name, 1, &args->plugin);
          if ( args->plugin.run )
          {
@@ -675,6 +691,7 @@ int main_plugin(int argc, char *argv[])
          {"targets-file",required_argument,NULL,'T'},
          {"targets-overlap",required_argument,NULL,2},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -723,6 +740,7 @@ int main_plugin(int argc, char *argv[])
                  break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
              case '?':
              case 'h': usage_only = 1; break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c

index b37ac235f789f39d597f9bd44d0b284c70960788..ad04eb44f297e9570f89ad30b46209e5c626e943 100644 (file)
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
  
-    Copyright (C) 2013-2021 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -151,6 +151,8 @@ typedef struct _args_t
  
      char **argv, *output_fname, *regions_list, *targets_list;
      int argc, drop_header, verbose, record_cmd_line, plist_only;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -550,6 +552,7 @@ static void init_data(args_t *args)
          if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
          if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
          if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      }
  }
  
@@ -571,7 +574,19 @@ static void destroy_data(args_t *args)
      }
      if ( args->filter )
          filter_destroy(args->filter);
-    if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    if (args->out_fh )
+    {
+        if ( args->write_index )
+        {
+            if ( bcf_idx_save(args->out_fh)<0 )
+            {
+                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+                error("Error: cannot write to index %s\n", args->index_fn);
+            }
+            free(args->index_fn);
+        }
+        if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    }
  }
  
  static void usage(args_t *args)
@@ -600,6 +615,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "   -l, --list-plugins             List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
      fprintf(bcftools_stderr, "   -v, --verbose                  Print verbose information, -vv increases verbosity\n");
      fprintf(bcftools_stderr, "   -V, --version                  Print version string and exit\n");
+    fprintf(bcftools_stderr, "       --write-index              Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      bcftools_exit(1);
  }
@@ -645,9 +661,9 @@ int main_plugin(int argc, char *argv[])
      if ( argv[1][0]!='-' )
      {
          args->verbose = is_verbose(argc, argv);
-        plugin_name = argv[1]; 
-        argc--; 
-        argv++; 
+        plugin_name = argv[1];
+        argc--;
+        argv++;
          load_plugin(args, plugin_name, 1, &args->plugin);
          if ( args->plugin.run )
          {
@@ -677,6 +693,7 @@ int main_plugin(int argc, char *argv[])
          {"targets-file",required_argument,NULL,'T'},
          {"targets-overlap",required_argument,NULL,2},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -725,6 +742,7 @@ int main_plugin(int argc, char *argv[])
                  break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
              case '?':
              case 'h': usage_only = 1; break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c

index 889f363242aa41d7e404bd1a32012ea02864f3de..5f4eb07c6ee0cfa71ba8448f8fbfe5475b98f2cc 100644 (file)
--- a/bcftools/vcfquery.c
+++ b/bcftools/vcfquery.c
@@ -1,6 +1,6 @@
  /*  vcfquery.c -- Extracts fields from VCF/BCF file.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -94,6 +94,7 @@ static void init_data(args_t *args)
          smpl_ilist_destroy(ilist);
      }
      args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+    convert_set_option(args->convert, force_newline, 1);
      convert_set_option(args->convert, subset_samples, &args->smpl_pass);
      if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
      free(samples);
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c

index f1e0f8be42b5336c9f5196c757d4e44db41d9c9f..e4f252033dc26be96013b453950016b72440b5b5 100644 (file)
--- a/bcftools/vcfquery.c.pysam.c
+++ b/bcftools/vcfquery.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfquery.c -- Extracts fields from VCF/BCF file.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -96,6 +96,7 @@ static void init_data(args_t *args)
          smpl_ilist_destroy(ilist);
      }
      args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+    convert_set_option(args->convert, force_newline, 1);
      convert_set_option(args->convert, subset_samples, &args->smpl_pass);
      if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
      free(samples);
diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c

index 1de2b2867b7d9442f5308ec658df78ff429db575..3b208a0d3f91cb8126b08214dc9565e51390a7d9 100644 (file)
--- a/bcftools/vcfsort.c
+++ b/bcftools/vcfsort.c
@@ -1,6 +1,6 @@
  /*  vcfsort.c -- sort subcommand
  
-   Copyright (C) 2017-2022 Genome Research Ltd.
+   Copyright (C) 2017-2023 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -62,6 +62,8 @@ typedef struct _args_t
      uint8_t *mem_block;
      size_t nbuf, mbuf, nblk;
      blk_t *blk;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -300,6 +302,7 @@ void merge_blocks(args_t *args)
      set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
      htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
      if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      while ( bhp->ndat )
      {
          blk_t *blk = bhp->dat[0];
@@ -307,6 +310,15 @@ void merge_blocks(args_t *args)
          khp_delete(blk, bhp);
          blk_read(args, bhp, args->hdr, blk);
      }
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out)<0 )
+        {
+            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
  
      clean_files(args);
@@ -333,6 +345,7 @@ static void usage(args_t *args)
  #else
      fprintf(stderr, "    -T, --temp-dir DIR             temporary files [/tmp/bcftools.XXXXXX]\n");
  #endif
+    fprintf(stderr, "        --write-index              Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -395,6 +408,7 @@ int main_sort(int argc, char *argv[])
          {"output-file",required_argument,NULL,'o'},
          {"output",required_argument,NULL,'o'},
          {"help",no_argument,NULL,'h'},
+        {"write-index",no_argument,NULL,1},
          {0,0,0,0}
      };
      char *tmp;
@@ -423,6 +437,7 @@ int main_sort(int argc, char *argv[])
                            if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                        }
                        break;
+            case  1 : args->write_index = 1; break;
              case 'h':
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c

index 79dbc431d63e472a4f98ff02b17dcc0372030867..948d60b777eae845b65293d3de1a792fbcca843d 100644 (file)
--- a/bcftools/vcfsort.c.pysam.c
+++ b/bcftools/vcfsort.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfsort.c -- sort subcommand
  
-   Copyright (C) 2017-2022 Genome Research Ltd.
+   Copyright (C) 2017-2023 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -64,6 +64,8 @@ typedef struct _args_t
      uint8_t *mem_block;
      size_t nbuf, mbuf, nblk;
      blk_t *blk;
+    char *index_fn;
+    int write_index;
  }
  args_t;
  
@@ -302,6 +304,7 @@ void merge_blocks(args_t *args)
      set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
      htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
      if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+    if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
      while ( bhp->ndat )
      {
          blk_t *blk = bhp->dat[0];
@@ -309,6 +312,15 @@ void merge_blocks(args_t *args)
          khp_delete(blk, bhp);
          blk_read(args, bhp, args->hdr, blk);
      }
+    if ( args->write_index )
+    {
+        if ( bcf_idx_save(out)<0 )
+        {
+            if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
      if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
  
      clean_files(args);
@@ -335,6 +347,7 @@ static void usage(args_t *args)
  #else
      fprintf(bcftools_stderr, "    -T, --temp-dir DIR             temporary files [/tmp/bcftools.XXXXXX]\n");
  #endif
+    fprintf(bcftools_stderr, "        --write-index              Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      bcftools_exit(1);
  }
@@ -397,6 +410,7 @@ int main_sort(int argc, char *argv[])
          {"output-file",required_argument,NULL,'o'},
          {"output",required_argument,NULL,'o'},
          {"help",no_argument,NULL,'h'},
+        {"write-index",no_argument,NULL,1},
          {0,0,0,0}
      };
      char *tmp;
@@ -425,6 +439,7 @@ int main_sort(int argc, char *argv[])
                            if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
                        }
                        break;
+            case  1 : args->write_index = 1; break;
              case 'h':
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c

index 10189fef94b79b298156ec4f816bee7ea3530deb..e2744ab3c662cf92b8c7355a0918165faf296565 100644 (file)
--- a/bcftools/vcfstats.c
+++ b/bcftools/vcfstats.c
@@ -70,6 +70,13 @@ typedef struct
  }
  idist_t;
  
+// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins
+typedef struct
+{
+    int snv[21], indel[21];
+}
+vaf_t;
+
  typedef struct
  {
      uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
@@ -93,7 +100,8 @@ typedef struct
      int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
      int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
      int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs;
-    int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+    int *smpl_frm_shifts;   // not-applicable, in-frame, out-frame
+    vaf_t vaf, *smpl_vaf;   // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions
      unsigned long int *smpl_dp;
      idist_t dp, dp_sites;
      int nusr;
@@ -141,7 +149,9 @@ typedef struct
      gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
      bin_t *af_bins;
      float *farr;
-    int mfarr;
+    int32_t *iarr;
+    int mfarr, miarr;
+    int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref;
  
      // indel context
      indel_ctx_t *indel_ctx;
@@ -447,6 +457,8 @@ static void init_stats(args_t *args)
      if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
          error("No such INFO tag: %s\n", args->af_tag);
  
+    int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0;
+
      #if QUAL_STATS
          args->m_qual = 999;
      #endif
@@ -501,6 +513,8 @@ static void init_stats(args_t *args)
              stats->smpl_dp     = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
              stats->smpl_ndp    = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_sngl   = (int *) calloc(args->files->n_smpl,sizeof(int));
+            if ( has_fmt_ad )
+                stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t));
              #if HWE_STATS
                  stats->af_hwe  = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
              #endif
@@ -586,6 +600,7 @@ static void destroy_stats(args_t *args)
          free(stats->smpl_dp);
          free(stats->smpl_ndp);
          free(stats->smpl_sngl);
+        free(stats->smpl_vaf);
          idist_destroy(&stats->dp);
          idist_destroy(&stats->dp_sites);
          for (j=0; j<stats->nusr; j++)
@@ -602,6 +617,7 @@ static void destroy_stats(args_t *args)
      for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
      if ( args->af_bins ) bin_destroy(args->af_bins);
      free(args->farr);
+    free(args->iarr);
      free(args->usr);
      free(args->tmp_frm);
      free(args->tmp_iaf);
@@ -615,6 +631,8 @@ static void destroy_stats(args_t *args)
      if (args->filter[1]) filter_destroy(args->filter[1]);
  }
  
+// The array tmp_iaf keeps the index of the AF bin for each allele; the first bin is for singletons.
+// The number of bins is either m_af (101) or as given by the user in --af-bins.
  static void init_iaf(args_t *args, bcf_sr_t *reader)
  {
      bcf1_t *line = reader->buffer[0];
@@ -869,205 +887,279 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
      }
  }
  
-static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal)
+// Returns the max non-ref AD value
+static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial)
  {
-    if ( !fmt ) return;
-
-    float dvaf;
+    int iv, ad = 0;
+    *ial = 0;
      #define BRANCH_INT(type_t,missing,vector_end) { \
-        type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \
-        if ( p[ial]==vector_end || p[jal]==vector_end ) return; \
-        if ( p[ial]==missing || p[jal]==missing ) return; \
-        if ( !p[ial] && !p[jal] ) return; \
-        dvaf = (float)p[ial]/(p[ial]+p[jal]); \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+        { \
+            if ( ptr[iv]==vector_end ) break; \
+            if ( ptr[iv]==missing ) continue; \
+            if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\
+        } \
      }
-    switch (fmt->type) {
+    switch (ad_fmt_ptr->type) {
          case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
          case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
          case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break;
+        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
      }
      #undef BRANCH_INT
-
+    return ad;
+}
+static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial)
+{
+    #define BRANCH_INT(type_t,missing,vector_end) { \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        if ( ptr[ial]==vector_end ) return 0; \
+        if ( ptr[ial]==missing ) return 0; \
+        return ptr[ial]; \
+    }
+    switch (ad_fmt_ptr->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
+    }
+    #undef BRANCH_INT
+}
+static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
+{
      int len = line->d.var[ial].n;
      if ( len < -stats->m_indel ) len = -stats->m_indel;
      else if ( len > stats->m_indel ) len = stats->m_indel;
      int bin = stats->m_indel + len;
      stats->nvaf[bin]++;
-    stats->dvaf[bin] += dvaf;
+    stats->dvaf[bin] += vaf;
+}
+#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
+static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
+{
+    int idx = vaf2bin(vaf);
+    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+    else smpl_vaf->indel[idx]++;
  }
  
-static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
  {
-    bcf_srs_t *files = args->files;
-    bcf1_t *line = reader->buffer[0];
-    bcf_fmt_t *fmt_ptr;
-    int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
-    int line_type = bcf_get_variant_types(line);
+    if ( dp_fmt_ptr )
+    {
+        #define BRANCH_INT(type_t,missing,vector_end) { \
+            type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \
+            if ( *ptr==missing || *ptr==vector_end ) return -1; \
+            return *ptr; \
+        }
+        switch (dp_fmt_ptr->type) {
+            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); exit(1); break;
+        }
+        #undef BRANCH_INT
+    }
+    if ( ad_fmt_ptr )
+    {
+        int iv, dp = 0, has_value = 0;
+        #define BRANCH_INT(type_t,missing,vector_end) { \
+            type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+            for (iv=0; iv<ad_fmt_ptr->n; iv++) \
+            { \
+                if ( ptr[iv]==vector_end ) break; \
+                if ( ptr[iv]==missing ) continue; \
+                has_value = 1; \
+                dp += ptr[iv]; \
+            } \
+        }
+        switch (ad_fmt_ptr->type) {
+            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
+        }
+        #undef BRANCH_INT
+        if ( !has_value ) return -1;
+        return dp;
+    }
+    return -1;
+}
+static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal)
+{
+    if ( gt==GT_UNKN )
+    {
+        stats->smpl_missing[ismpl]++;
+        return;
+    }
  
-    if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+    int var_type = 0;
+    if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+    if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+    if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
      {
-        bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL;
+        if ( var_type&VCF_INDEL && stats->smpl_frm_shifts )
+        {
+            assert( ial<line->n_allele );
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+        }
+        if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++;
+        if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++;
+        return;
+    }
+    if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; }
+    #if HWE_STATS
+        switch (gt)
+        {
+            case GT_HOM_RR: args->nref_tot++; break;
+            case GT_HET_RA: args->nhet_tot++; break;
+            case GT_HET_AA:
+            case GT_HOM_AA: args->nalt_tot++; break;
+        }
+    #endif
  
-        int ref = bcf_acgt2int(*line->d.allele[0]);
-        int is, n_nref = 0, i_nref = 0;
-        for (is=0; is<args->files->n_smpl; is++)
+    if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
+    {
+        if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++;
+        else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++;
+        else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++;
+        else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++;
+        if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
          {
-            int ial, jal;
-            int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
-            if ( gt==GT_UNKN )
-            {
-                stats->smpl_missing[is]++;
-                continue;
-            }
-            if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+            int ref = bcf_acgt2int(*line->d.allele[0]);
+            int alt = bcf_acgt2int(*line->d.allele[ial]);
+            if ( alt<0 ) return;
+            if ( abs(ref-alt)==2 )
+                stats->smpl_ts[ismpl]++;
+            else
+                stats->smpl_tv[ismpl]++;
+        }
+        if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal )
+        {
+            int ref = bcf_acgt2int(*line->d.allele[0]);
+            int alt = bcf_acgt2int(*line->d.allele[jal]);
+            if ( alt<0 ) return;
+            if ( abs(ref-alt)==2 )
+                stats->smpl_ts[ismpl]++;
+            else
+                stats->smpl_tv[ismpl]++;
+        }
+    }
+    if ( var_type&VCF_INDEL )
+    {
+        if ( gt != GT_HOM_RR )
+        {
+            stats->smpl_indels[ismpl]++;
+            if ( gt==GT_HET_RA || gt==GT_HET_AA )
              {
-                if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+                int is_ins = 0, is_del = 0;
+                if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
                  {
-                    assert( ial<line->n_allele );
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+                    if ( line->d.var[ial].n < 0 ) is_del = 1;
+                    else is_ins = 1;
                  }
-                if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
-                if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
-                continue;
-            }
-            if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
-            #if HWE_STATS
-                switch (gt)
+                if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
                  {
-                    case GT_HOM_RR: nref_tot++; break;
-                    case GT_HET_RA: nhet_tot++; break;
-                    case GT_HET_AA:
-                    case GT_HOM_AA: nalt_tot++; break;
-                }
-            #endif
-            int var_type = 0;
-            if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
-            if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
-            if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
-            {
-                if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
-                else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
-                else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
-                else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
-                if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
-                {
-                    int alt = bcf_acgt2int(*line->d.allele[ial]);
-                    if ( alt<0 ) continue;
-                    if ( abs(ref-alt)==2 )
-                        stats->smpl_ts[is]++;
-                    else
-                        stats->smpl_tv[is]++;
+                    if ( line->d.var[jal].n < 0 ) is_del = 1;
+                    else is_ins = 1;
                  }
+                // Note that alt-het genotypes with both ins and del allele are counted twice!!
+                if ( is_del ) stats->smpl_del_hets[ismpl]++;
+                if ( is_ins ) stats->smpl_ins_hets[ismpl]++;
              }
-            if ( var_type&VCF_INDEL )
+            else if ( gt==GT_HOM_AA )
              {
-                if ( gt != GT_HOM_RR )
-                {
-                    stats->smpl_indels[is]++;
-
-                    if ( gt==GT_HET_RA || gt==GT_HET_AA )
-                    {
-                        int is_ins = 0, is_del = 0;
-                        if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
-                        {
-                            if ( line->d.var[ial].n < 0 ) is_del = 1;
-                            else is_ins = 1;
-                            update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal);
-                        }
-                        if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
-                        {
-                            if ( line->d.var[jal].n < 0 ) is_del = 1;
-                            else is_ins = 1;
-                            update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial);
-                        }
-                        // Note that alt-het genotypes with both ins and del allele are counted twice!!
-                        if ( is_del ) stats->smpl_del_hets[is]++;
-                        if ( is_ins ) stats->smpl_ins_hets[is]++;
-                    }
-                    else if ( gt==GT_HOM_AA )
-                    {
-                        if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++;
-                        else stats->smpl_ins_homs[is]++;
-                    }
-                }
-                if ( stats->smpl_frm_shifts )
-                {
-                    assert( ial<line->n_allele && jal<line->n_allele );
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
-                }
+                if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++;
+                else stats->smpl_ins_homs[ismpl]++;
              }
          }
-        if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+        if ( stats->smpl_frm_shifts )
+        {
+            assert( ial<line->n_allele && jal<line->n_allele );
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++;
+        }
      }
+}
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+    bcf_srs_t *files = args->files;
+    bcf1_t *line = reader->buffer[0];
  
-    #if HWE_STATS
-        if ( nhet_tot + nref_tot + nalt_tot )
+    args->nref_tot = 0;
+    args->nhet_tot = 0;
+    args->nalt_tot = 0;
+    args->n_nref   = 0;
+    args->i_nref   = 0;
+
+    bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT");
+    bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD");
+    bcf_fmt_t *dp_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP");
+
+    int is;
+    for (is=0; is<args->files->n_smpl; is++)
+    {
+        // Determine depth
+        int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr);
+        if ( dp>0 )
          {
-            float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
-            int idx = het_frac*(args->naf_hwe - 1);
-//check me: what is this?
-            if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
-            stats->af_hwe[idx]++;
+            (*idist(&stats->dp, dp))++;
+            stats->smpl_ndp[is]++;
+            stats->smpl_dp[is] += dp;
          }
-    #endif
  
-    if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
-    {
-        #define BRANCH_INT(type_t,missing,vector_end) { \
-            int is; \
-            for (is=0; is<args->files->n_smpl; is++) \
-            { \
-                type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
-                if ( *p==vector_end ) continue; \
-                if ( *p!=missing ) \
-                { \
-                    (*idist(&stats->dp, *p))++; \
-                    stats->smpl_ndp[is]++; \
-                    stats->smpl_dp[is] += *p; \
-                } \
-            } \
+        // Determine genotype
+        int ial, jal, gt=GT_UNKN;
+        if ( gt_fmt_ptr )
+        {
+            gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal);
+            sample_gt_stats(args,stats,line,is,gt,ial,jal);
          }
-        switch (fmt_ptr->type) {
-            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+
+        // Determine variant allele frequency
+        if ( dp>0 && ad_fmt_ptr )
+        {
+            float iad = 0, jad = 0;
+            if ( gt==GT_UNKN )    // GT not available
+            {
+                iad = get_ad(line,ad_fmt_ptr,is,&ial);
+            }
+            else if ( gt!=GT_UNKN )
+            {
+                iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial);
+                jad = jal==0 ? 0 : get_iad(line,ad_fmt_ptr,is,jal);
+            }
+            if ( iad )
+            {
+                update_dvaf(stats,line,ial,(float)iad/dp);
+                update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp);
+            }
+            if ( jad && iad!=jad )
+            {
+                update_dvaf(stats,line,jal,(float)jad/dp);
+                update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp);
+            }
          }
-        #undef BRANCH_INT
      }
-    else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) )
+    if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++;
+
+#if HWE_STATS
+    if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) )
      {
-        #define BRANCH_INT(type_t,missing,vector_end) { \
-            int is,iv; \
-            for (is=0; is<args->files->n_smpl; is++) \
-            { \
-                type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
-                int dp = 0, has_value = 0; \
-                for (iv=0; iv<fmt_ptr->n; iv++) \
-                { \
-                    if ( p[iv]==vector_end ) break; \
-                    if ( p[iv]==missing ) continue; \
-                    has_value = 1; \
-                    dp += p[iv]; \
-                } \
-                if ( has_value ) \
-                { \
-                    (*idist(&stats->dp, dp))++; \
-                    stats->smpl_ndp[is]++; \
-                    stats->smpl_dp[is] += dp; \
-                } \
-            } \
-        }
-        switch (fmt_ptr->type) {
-            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
-        }
-        #undef BRANCH_INT
+        // Number of heterozygous genotypes observed for any given allele frequency. This is used
+        // by plot-vcfstats to show the observed vs expected number of hets. There the expected number
+        // of hets is calculated from the probability P(het) = 2*AF*(1-AF).
+        // The array af_hwe is organized as follows
+        //      m_af     .. number of allele frequency bins
+        //      naf_hwe  .. the number of het genotype frequency bins
+        //      iallele_freq*naf_hwe + ihet_freq
+        //
+        float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot);
+        int ihet_freq = het_frac * (args->naf_hwe - 1);
+        int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe;
+        stats->af_hwe[idx]++;
      }
+#endif
  
      if ( matched==3 )
      {
@@ -1200,8 +1292,8 @@ static void do_vcf_stats(args_t *args)
          if ( files->n_smpl )
              do_sample_stats(args, stats, reader, ret);
  
-        if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
-            (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+        if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 )
+            (*idist(&stats->dp_sites, args->iarr[0]))++;
      }
  }
  
@@ -1736,6 +1828,24 @@ static void print_stats(args_t *args)
          }
          #endif
      }
+
+    if ( args->stats[0].smpl_vaf )
+    {
+        printf("# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n");
+        printf("# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n");
+        for (id=0; id<args->nstats; id++)
+        {
+            stats_t *stats = &args->stats[id];
+            for (i=0; i<args->files->n_smpl; i++)
+            {
+                printf("VAF\t%d\t%s\t", id,args->files->samples[i]);
+                for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].snv[j]);
+                printf("\t");
+                for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].indel[j]);
+                printf("\n");
+            }
+        }
+    }
  }
  
  static void usage(void)
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c

index 3b7da5ad5630526b37a7704ed01f032c18555d65..11db1d1cc05afec64dda8cf3de392cc8f27f9e6b 100644 (file)
--- a/bcftools/vcfstats.c.pysam.c
+++ b/bcftools/vcfstats.c.pysam.c
@@ -72,6 +72,13 @@ typedef struct
  }
  idist_t;
  
+// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins
+typedef struct
+{
+    int snv[21], indel[21];
+}
+vaf_t;
+
  typedef struct
  {
      uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
@@ -95,7 +102,8 @@ typedef struct
      int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
      int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
      int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs;
-    int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+    int *smpl_frm_shifts;   // not-applicable, in-frame, out-frame
+    vaf_t vaf, *smpl_vaf;   // total (INFO/AD) and per-sample (FMT/AD) VAF distributions
      unsigned long int *smpl_dp;
      idist_t dp, dp_sites;
      int nusr;
@@ -143,7 +151,9 @@ typedef struct
      gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
      bin_t *af_bins;
      float *farr;
-    int mfarr;
+    int32_t *iarr;
+    int mfarr, miarr;
+    int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref;
  
      // indel context
      indel_ctx_t *indel_ctx;
@@ -449,6 +459,8 @@ static void init_stats(args_t *args)
      if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
          error("No such INFO tag: %s\n", args->af_tag);
  
+    int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0;
+
      #if QUAL_STATS
          args->m_qual = 999;
      #endif
@@ -503,6 +515,8 @@ static void init_stats(args_t *args)
              stats->smpl_dp     = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
              stats->smpl_ndp    = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_sngl   = (int *) calloc(args->files->n_smpl,sizeof(int));
+            if ( has_fmt_ad )
+                stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t));
              #if HWE_STATS
                  stats->af_hwe  = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
              #endif
@@ -588,6 +602,7 @@ static void destroy_stats(args_t *args)
          free(stats->smpl_dp);
          free(stats->smpl_ndp);
          free(stats->smpl_sngl);
+        free(stats->smpl_vaf);
          idist_destroy(&stats->dp);
          idist_destroy(&stats->dp_sites);
          for (j=0; j<stats->nusr; j++)
@@ -604,6 +619,7 @@ static void destroy_stats(args_t *args)
      for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
      if ( args->af_bins ) bin_destroy(args->af_bins);
      free(args->farr);
+    free(args->iarr);
      free(args->usr);
      free(args->tmp_frm);
      free(args->tmp_iaf);
@@ -617,6 +633,8 @@ static void destroy_stats(args_t *args)
      if (args->filter[1]) filter_destroy(args->filter[1]);
  }
  
+// The array tmp_iaf keeps the index of the AF bin for each allele; the first bin is for singletons.
+// The number of bins is either m_af (101) or as given by the user in --af-bins.
  static void init_iaf(args_t *args, bcf_sr_t *reader)
  {
      bcf1_t *line = reader->buffer[0];
@@ -871,205 +889,279 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
      }
  }
  
-static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal)
+// Returns the max non-ref AD value
+static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial)
  {
-    if ( !fmt ) return;
-
-    float dvaf;
+    int iv, ad = 0;
+    *ial = 0;
      #define BRANCH_INT(type_t,missing,vector_end) { \
-        type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \
-        if ( p[ial]==vector_end || p[jal]==vector_end ) return; \
-        if ( p[ial]==missing || p[jal]==missing ) return; \
-        if ( !p[ial] && !p[jal] ) return; \
-        dvaf = (float)p[ial]/(p[ial]+p[jal]); \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+        { \
+            if ( ptr[iv]==vector_end ) break; \
+            if ( ptr[iv]==missing ) continue; \
+            if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\
+        } \
      }
-    switch (fmt->type) {
+    switch (ad_fmt_ptr->type) {
          case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
          case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
          case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-        default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); bcftools_exit(1); break;
+        default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break;
      }
      #undef BRANCH_INT
-
+    return ad;
+}
+static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial)
+{
+    #define BRANCH_INT(type_t,missing,vector_end) { \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        if ( ptr[ial]==vector_end ) return 0; \
+        if ( ptr[ial]==missing ) return 0; \
+        return ptr[ial]; \
+    }
+    switch (ad_fmt_ptr->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+        default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break;
+    }
+    #undef BRANCH_INT
+}
+static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
+{
      int len = line->d.var[ial].n;
      if ( len < -stats->m_indel ) len = -stats->m_indel;
      else if ( len > stats->m_indel ) len = stats->m_indel;
      int bin = stats->m_indel + len;
      stats->nvaf[bin]++;
-    stats->dvaf[bin] += dvaf;
+    stats->dvaf[bin] += vaf;
+}
+#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
+static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
+{
+    int idx = vaf2bin(vaf);
+    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+    else smpl_vaf->indel[idx]++;
  }
  
-static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
  {
-    bcf_srs_t *files = args->files;
-    bcf1_t *line = reader->buffer[0];
-    bcf_fmt_t *fmt_ptr;
-    int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
-    int line_type = bcf_get_variant_types(line);
+    if ( dp_fmt_ptr )
+    {
+        #define BRANCH_INT(type_t,missing,vector_end) { \
+            type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \
+            if ( *ptr==missing || *ptr==vector_end ) return -1; \
+            return *ptr; \
+        }
+        switch (dp_fmt_ptr->type) {
+            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); bcftools_exit(1); break;
+        }
+        #undef BRANCH_INT
+    }
+    if ( ad_fmt_ptr )
+    {
+        int iv, dp = 0, has_value = 0;
+        #define BRANCH_INT(type_t,missing,vector_end) { \
+            type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+            for (iv=0; iv<ad_fmt_ptr->n; iv++) \
+            { \
+                if ( ptr[iv]==vector_end ) break; \
+                if ( ptr[iv]==missing ) continue; \
+                has_value = 1; \
+                dp += ptr[iv]; \
+            } \
+        }
+        switch (ad_fmt_ptr->type) {
+            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
+            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break;
+        }
+        #undef BRANCH_INT
+        if ( !has_value ) return -1;
+        return dp;
+    }
+    return -1;
+}
+static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal)
+{
+    if ( gt==GT_UNKN )
+    {
+        stats->smpl_missing[ismpl]++;
+        return;
+    }
  
-    if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+    int var_type = 0;
+    if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+    if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+    if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
      {
-        bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL;
+        if ( var_type&VCF_INDEL && stats->smpl_frm_shifts )
+        {
+            assert( ial<line->n_allele );
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+        }
+        if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++;
+        if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++;
+        return;
+    }
+    if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; }
+    #if HWE_STATS
+        switch (gt)
+        {
+            case GT_HOM_RR: args->nref_tot++; break;
+            case GT_HET_RA: args->nhet_tot++; break;
+            case GT_HET_AA:
+            case GT_HOM_AA: args->nalt_tot++; break;
+        }
+    #endif
  
-        int ref = bcf_acgt2int(*line->d.allele[0]);
-        int is, n_nref = 0, i_nref = 0;
-        for (is=0; is<args->files->n_smpl; is++)
+    if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
+    {
+        if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++;
+        else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++;
+        else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++;
+        else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++;
+        if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
          {
-            int ial, jal;
-            int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
-            if ( gt==GT_UNKN )
-            {
-                stats->smpl_missing[is]++;
-                continue;
-            }
-            if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+            int ref = bcf_acgt2int(*line->d.allele[0]);
+            int alt = bcf_acgt2int(*line->d.allele[ial]);
+            if ( alt<0 ) return;
+            if ( abs(ref-alt)==2 )
+                stats->smpl_ts[ismpl]++;
+            else
+                stats->smpl_tv[ismpl]++;
+        }
+        if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal )
+        {
+            int ref = bcf_acgt2int(*line->d.allele[0]);
+            int alt = bcf_acgt2int(*line->d.allele[jal]);
+            if ( alt<0 ) return;
+            if ( abs(ref-alt)==2 )
+                stats->smpl_ts[ismpl]++;
+            else
+                stats->smpl_tv[ismpl]++;
+        }
+    }
+    if ( var_type&VCF_INDEL )
+    {
+        if ( gt != GT_HOM_RR )
+        {
+            stats->smpl_indels[ismpl]++;
+            if ( gt==GT_HET_RA || gt==GT_HET_AA )
              {
-                if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+                int is_ins = 0, is_del = 0;
+                if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
                  {
-                    assert( ial<line->n_allele );
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+                    if ( line->d.var[ial].n < 0 ) is_del = 1;
+                    else is_ins = 1;
                  }
-                if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
-                if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
-                continue;
-            }
-            if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
-            #if HWE_STATS
-                switch (gt)
+                if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
                  {
-                    case GT_HOM_RR: nref_tot++; break;
-                    case GT_HET_RA: nhet_tot++; break;
-                    case GT_HET_AA:
-                    case GT_HOM_AA: nalt_tot++; break;
-                }
-            #endif
-            int var_type = 0;
-            if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
-            if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
-            if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
-            {
-                if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
-                else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
-                else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
-                else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
-                if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
-                {
-                    int alt = bcf_acgt2int(*line->d.allele[ial]);
-                    if ( alt<0 ) continue;
-                    if ( abs(ref-alt)==2 )
-                        stats->smpl_ts[is]++;
-                    else
-                        stats->smpl_tv[is]++;
+                    if ( line->d.var[jal].n < 0 ) is_del = 1;
+                    else is_ins = 1;
                  }
+                // Note that alt-het genotypes with both ins and del allele are counted twice!!
+                if ( is_del ) stats->smpl_del_hets[ismpl]++;
+                if ( is_ins ) stats->smpl_ins_hets[ismpl]++;
              }
-            if ( var_type&VCF_INDEL )
+            else if ( gt==GT_HOM_AA )
              {
-                if ( gt != GT_HOM_RR )
-                {
-                    stats->smpl_indels[is]++;
-
-                    if ( gt==GT_HET_RA || gt==GT_HET_AA )
-                    {
-                        int is_ins = 0, is_del = 0;
-                        if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
-                        {
-                            if ( line->d.var[ial].n < 0 ) is_del = 1;
-                            else is_ins = 1;
-                            update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal);
-                        }
-                        if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
-                        {
-                            if ( line->d.var[jal].n < 0 ) is_del = 1;
-                            else is_ins = 1;
-                            update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial);
-                        }
-                        // Note that alt-het genotypes with both ins and del allele are counted twice!!
-                        if ( is_del ) stats->smpl_del_hets[is]++;
-                        if ( is_ins ) stats->smpl_ins_hets[is]++;
-                    }
-                    else if ( gt==GT_HOM_AA )
-                    {
-                        if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++;
-                        else stats->smpl_ins_homs[is]++;
-                    }
-                }
-                if ( stats->smpl_frm_shifts )
-                {
-                    assert( ial<line->n_allele && jal<line->n_allele );
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
-                    stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
-                }
+                if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++;
+                else stats->smpl_ins_homs[ismpl]++;
              }
          }
-        if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+        if ( stats->smpl_frm_shifts )
+        {
+            assert( ial<line->n_allele && jal<line->n_allele );
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+            stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++;
+        }
      }
+}
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+    bcf_srs_t *files = args->files;
+    bcf1_t *line = reader->buffer[0];
  
-    #if HWE_STATS
-        if ( nhet_tot + nref_tot + nalt_tot )
+    args->nref_tot = 0;
+    args->nhet_tot = 0;
+    args->nalt_tot = 0;
+    args->n_nref   = 0;
+    args->i_nref   = 0;
+
+    bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT");
+    bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD");
+    bcf_fmt_t *dp_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP");
+
+    int is;
+    for (is=0; is<args->files->n_smpl; is++)
+    {
+        // Determine depth
+        int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr);
+        if ( dp>0 )
          {
-            float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
-            int idx = het_frac*(args->naf_hwe - 1);
-//check me: what is this?
-            if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
-            stats->af_hwe[idx]++;
+            (*idist(&stats->dp, dp))++;
+            stats->smpl_ndp[is]++;
+            stats->smpl_dp[is] += dp;
          }
-    #endif
  
-    if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
-    {
-        #define BRANCH_INT(type_t,missing,vector_end) { \
-            int is; \
-            for (is=0; is<args->files->n_smpl; is++) \
-            { \
-                type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
-                if ( *p==vector_end ) continue; \
-                if ( *p!=missing ) \
-                { \
-                    (*idist(&stats->dp, *p))++; \
-                    stats->smpl_ndp[is]++; \
-                    stats->smpl_dp[is] += *p; \
-                } \
-            } \
+        // Determine genotype
+        int ial, jal, gt=GT_UNKN;
+        if ( gt_fmt_ptr )
+        {
+            gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal);
+            sample_gt_stats(args,stats,line,is,gt,ial,jal);
          }
-        switch (fmt_ptr->type) {
-            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
+
+        // Determine variant allele frequency
+        if ( dp>0 && ad_fmt_ptr )
+        {
+            float iad = 0, jad = 0;
+            if ( gt==GT_UNKN )    // GT not available
+            {
+                iad = get_ad(line,ad_fmt_ptr,is,&ial);
+            }
+            else if ( gt!=GT_UNKN )
+            {
+                iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial);
+                jad = jal==0 ? 0 : get_iad(line,ad_fmt_ptr,is,jal);
+            }
+            if ( iad )
+            {
+                update_dvaf(stats,line,ial,(float)iad/dp);
+                update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp);
+            }
+            if ( jad && iad!=jad )
+            {
+                update_dvaf(stats,line,jal,(float)jad/dp);
+                update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp);
+            }
          }
-        #undef BRANCH_INT
      }
-    else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) )
+    if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++;
+
+#if HWE_STATS
+    if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) )
      {
-        #define BRANCH_INT(type_t,missing,vector_end) { \
-            int is,iv; \
-            for (is=0; is<args->files->n_smpl; is++) \
-            { \
-                type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
-                int dp = 0, has_value = 0; \
-                for (iv=0; iv<fmt_ptr->n; iv++) \
-                { \
-                    if ( p[iv]==vector_end ) break; \
-                    if ( p[iv]==missing ) continue; \
-                    has_value = 1; \
-                    dp += p[iv]; \
-                } \
-                if ( has_value ) \
-                { \
-                    (*idist(&stats->dp, dp))++; \
-                    stats->smpl_ndp[is]++; \
-                    stats->smpl_dp[is] += dp; \
-                } \
-            } \
-        }
-        switch (fmt_ptr->type) {
-            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
-            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
-            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
-        }
-        #undef BRANCH_INT
+        // Number of heterozygous genotypes observed for any given allele frequency. This is used
+        // by plot-vcfstats to show the observed vs expected number of hets. There the expected number
+        // of hets is calculated from the probability P(het) = 2*AF*(1-AF).
+        // The array af_hwe is organized as follows
+        //      m_af     .. number of allele frequency bins
+        //      naf_hwe  .. the number of het genotype frequency bins
+        //      iallele_freq*naf_hwe + ihet_freq
+        //
+        float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot);
+        int ihet_freq = het_frac * (args->naf_hwe - 1);
+        int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe;
+        stats->af_hwe[idx]++;
      }
+#endif
  
      if ( matched==3 )
      {
@@ -1202,8 +1294,8 @@ static void do_vcf_stats(args_t *args)
          if ( files->n_smpl )
              do_sample_stats(args, stats, reader, ret);
  
-        if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
-            (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+        if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 )
+            (*idist(&stats->dp_sites, args->iarr[0]))++;
      }
  }
  
@@ -1738,6 +1830,24 @@ static void print_stats(args_t *args)
          }
          #endif
      }
+
+    if ( args->stats[0].smpl_vaf )
+    {
+        fprintf(bcftools_stdout, "# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n");
+        fprintf(bcftools_stdout, "# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n");
+        for (id=0; id<args->nstats; id++)
+        {
+            stats_t *stats = &args->stats[id];
+            for (i=0; i<args->files->n_smpl; i++)
+            {
+                fprintf(bcftools_stdout, "VAF\t%d\t%s\t", id,args->files->samples[i]);
+                for (j=0; j<21; j++) fprintf(bcftools_stdout, "%s%d",j?",":"",stats->smpl_vaf[i].snv[j]);
+                fprintf(bcftools_stdout, "\t");
+                for (j=0; j<21; j++) fprintf(bcftools_stdout, "%s%d",j?",":"",stats->smpl_vaf[i].indel[j]);
+                fprintf(bcftools_stdout, "\n");
+            }
+        }
+    }
  }
  
  static void usage(void)
diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c

index 96dcbc7b5d0f9a4834439089dbd09a65c2ed3e73..e09efa0bc0314faf2908d22eb689079786cb2f71 100644 (file)
--- a/bcftools/vcfview.c
+++ b/bcftools/vcfview.c
@@ -1,6 +1,6 @@
  /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -76,6 +76,8 @@ typedef struct _args_t
      char *include_types, *exclude_types;
      int include, exclude;
      int record_cmd_line;
+    char *index_fn;
+    int write_index;
      htsFile *out;
  }
  args_t;
@@ -532,6 +534,7 @@ static void usage(args_t *args)
      fprintf(stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
      fprintf(stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
      fprintf(stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+    fprintf(stderr, "          --write-index                    Automatically index the output files [off]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -548,6 +551,7 @@ int main_vcfview(int argc, char *argv[])
      args->output_type = FT_VCF;
      args->n_threads = 0;
      args->record_cmd_line = 1;
+    args->write_index = 0;
      args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
      args->regions_overlap = 1;
      args->targets_overlap = 0;
@@ -596,6 +600,7 @@ int main_vcfview(int argc, char *argv[])
          {"phased",no_argument,NULL,'p'},
          {"exclude-phased",no_argument,NULL,'P'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -727,6 +732,7 @@ int main_vcfview(int argc, char *argv[])
                  break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
          }
@@ -783,6 +789,8 @@ int main_vcfview(int argc, char *argv[])
      else if ( args->output_type & FT_BCF )
          error("BCF output requires header, cannot proceed with -H\n");
  
+    if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+
      int ret = 0;
      if (!args->header_only)
      {
@@ -795,7 +803,18 @@ int main_vcfview(int argc, char *argv[])
          ret = args->files->errnum;
          if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
      }
-    hts_close(args->out);
+
+    if (args->write_index)
+    {
+        if (bcf_idx_save(args->out) < 0)
+        {
+            if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
+
+    if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
      destroy_data(args);
      bcf_sr_destroy(args->files);
      free(args);
diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c

index 85f483d5dc8315e67c93ed1d02e1dc0a0705573e..1485b1e65f87ce4a30f5c69313e9290015b619bb 100644 (file)
--- a/bcftools/vcfview.c.pysam.c
+++ b/bcftools/vcfview.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
  
-    Copyright (C) 2013-2022 Genome Research Ltd.
+    Copyright (C) 2013-2023 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -78,6 +78,8 @@ typedef struct _args_t
      char *include_types, *exclude_types;
      int include, exclude;
      int record_cmd_line;
+    char *index_fn;
+    int write_index;
      htsFile *out;
  }
  args_t;
@@ -534,6 +536,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -u/U, --uncalled/--exclude-uncalled    Select/exclude sites without a called genotype\n");
      fprintf(bcftools_stderr, "    -v/V, --types/--exclude-types LIST     Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
      fprintf(bcftools_stderr, "    -x/X, --private/--exclude-private      Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+    fprintf(bcftools_stderr, "          --write-index                    Automatically index the output files [off]\n");
      fprintf(bcftools_stderr, "\n");
      bcftools_exit(1);
  }
@@ -550,6 +553,7 @@ int main_vcfview(int argc, char *argv[])
      args->output_type = FT_VCF;
      args->n_threads = 0;
      args->record_cmd_line = 1;
+    args->write_index = 0;
      args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
      args->regions_overlap = 1;
      args->targets_overlap = 0;
@@ -598,6 +602,7 @@ int main_vcfview(int argc, char *argv[])
          {"phased",no_argument,NULL,'p'},
          {"exclude-phased",no_argument,NULL,'P'},
          {"no-version",no_argument,NULL,8},
+        {"write-index",no_argument,NULL,10},
          {NULL,0,NULL,0}
      };
      char *tmp;
@@ -729,6 +734,7 @@ int main_vcfview(int argc, char *argv[])
                  break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->write_index = 1; break;
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
          }
@@ -785,6 +791,8 @@ int main_vcfview(int argc, char *argv[])
      else if ( args->output_type & FT_BCF )
          error("BCF output requires header, cannot proceed with -H\n");
  
+    if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+
      int ret = 0;
      if (!args->header_only)
      {
@@ -797,7 +805,18 @@ int main_vcfview(int argc, char *argv[])
          ret = args->files->errnum;
          if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
      }
-    hts_close(args->out);
+
+    if (args->write_index)
+    {
+        if (bcf_idx_save(args->out) < 0)
+        {
+            if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"bcftools_stdout");
+            error("Error: cannot write to index %s\n", args->index_fn);
+        }
+        free(args->index_fn);
+    }
+
+    if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"bcftools_stdout");
      destroy_data(args);
      bcf_sr_destroy(args->files);
      free(args);
diff --git a/bcftools/version.c b/bcftools/version.c

index 4306d40116c492ebb9218240aa32f02a5ab9006a..38417a78b91db6c7050f3488416389e47147ff08 100644 (file)
--- a/bcftools/version.c
+++ b/bcftools/version.c
@@ -1,6 +1,6 @@
  /*  version.c -- report version numbers for plugins.
  
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -72,22 +72,26 @@ const char *hts_bcf_wmode(int file_type)
  const char *hts_bcf_wmode2(int file_type, const char *fname)
  {
      if ( !fname ) return hts_bcf_wmode(file_type);
-    int len = strlen(fname);
-    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
-    if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
-    if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
-    if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+    if ( !end ) end = fname ? fname + strlen(fname) : fname;
+    int len = end - fname;
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
+    if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF);
+    if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
      return hts_bcf_wmode(file_type);
  }
  
  void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
  {
      const char *ret = NULL;
-    int len = fname ? strlen(fname) : 0;
-    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
-    else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
-    else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
-    else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+    if ( !end ) end = fname ? fname + strlen(fname) : fname;
+    int len = end - fname;
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
+    else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
      else ret = hts_bcf_wmode(file_type);
      if ( clevel>=0 && clevel<=9 )
      {
@@ -107,3 +111,33 @@ int parse_overlap_option(const char *arg)
      else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2;
      else return -1;
  }
+
+// See also samtools/sam_utils.c auto_index()
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
+{
+    int min_shift = 14; // CSI
+
+    if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
+
+    char *delim = strstr(fname, HTS_IDX_DELIM);
+    if (delim)
+    {
+        delim += strlen(HTS_IDX_DELIM);
+        *idx_fname = strdup(delim);
+        if ( !*idx_fname ) return -1;
+
+        size_t l = strlen(*idx_fname);
+        if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
+    }
+    else
+    {
+        if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
+        sprintf(*idx_fname, "%s.csi", fname);
+    }
+
+    if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1;
+
+    return 0;
+}
+
+
diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c

index df12fc4859eb00eb244c09b12394943d1ec5ca73..23949bf0236b816a36ab861857b587906cd44d38 100644 (file)
--- a/bcftools/version.c.pysam.c
+++ b/bcftools/version.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  version.c -- report version numbers for plugins.
  
-    Copyright (C) 2014-2021 Genome Research Ltd.
+    Copyright (C) 2014-2023 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -74,22 +74,26 @@ const char *hts_bcf_wmode(int file_type)
  const char *hts_bcf_wmode2(int file_type, const char *fname)
  {
      if ( !fname ) return hts_bcf_wmode(file_type);
-    int len = strlen(fname);
-    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
-    if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
-    if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
-    if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+    if ( !end ) end = fname ? fname + strlen(fname) : fname;
+    int len = end - fname;
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
+    if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF);
+    if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
      return hts_bcf_wmode(file_type);
  }
  
  void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
  {
      const char *ret = NULL;
-    int len = fname ? strlen(fname) : 0;
-    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
-    else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
-    else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
-    else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+    if ( !end ) end = fname ? fname + strlen(fname) : fname;
+    int len = end - fname;
+    if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
+    else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
      else ret = hts_bcf_wmode(file_type);
      if ( clevel>=0 && clevel<=9 )
      {
@@ -109,3 +113,33 @@ int parse_overlap_option(const char *arg)
      else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2;
      else return -1;
  }
+
+// See also samtools/sam_utils.c auto_index()
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
+{
+    int min_shift = 14; // CSI
+
+    if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
+
+    char *delim = strstr(fname, HTS_IDX_DELIM);
+    if (delim)
+    {
+        delim += strlen(HTS_IDX_DELIM);
+        *idx_fname = strdup(delim);
+        if ( !*idx_fname ) return -1;
+
+        size_t l = strlen(*idx_fname);
+        if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
+    }
+    else
+    {
+        if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
+        sprintf(*idx_fname, "%s.csi", fname);
+    }
+
+    if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1;
+
+    return 0;
+}
+
+
diff --git a/bcftools/version.sh b/bcftools/version.sh

index 55d804296c41ed8a2d2249a9dcc268e958cb2a26..69bf963de4007b158f09e2a4fe782e2e07527645 100755 (executable)
--- a/bcftools/version.sh
+++ b/bcftools/version.sh
@@ -24,7 +24,7 @@
  # DEALINGS IN THE SOFTWARE.
  
  # Master version, for use in tarballs or non-git source copies
-VERSION=1.17
+VERSION=1.18
  
  # If we have a git clone, then check against the current tag
  if [ -e .git ]
diff --git a/cy_build.py b/cy_build.py

deleted file mode 100644 (file)

index 59a6e12..0000000
--- a/cy_build.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import re
-import sys
-
-try:
-    from Cython.Distutils import build_ext
-except ImportError:
-    from setuptools.command.build_ext import build_ext
-
-from distutils.extension import Extension
-from distutils.sysconfig import get_config_var, get_config_vars, get_python_version
-from pkg_resources import Distribution
-
-
-if sys.platform == 'darwin':
-    config_vars = get_config_vars()
-    config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
-    config_vars['SHLIB_EXT'] = '.so'
-
-
-def is_pip_install():
-    if "_" in os.environ and os.environ["_"].endswith("pip"):
-        return True
-    if "pip-egg-info" in sys.argv:
-        return True
-    if re.search("/pip-.*-build/", __file__):
-        return True
-    return False
-
-
-class CyExtension(Extension):
-    def __init__(self, *args, **kwargs):
-        self._init_func = kwargs.pop("init_func", None)
-        self._prebuild_func = kwargs.pop("prebuild_func", None)
-        Extension.__init__(self, *args, **kwargs)
-
-    def extend_includes(self, includes):
-        self.include_dirs.extend(includes)
-
-    def extend_macros(self, macros):
-        self.define_macros.extend(macros)
-
-    def extend_extra_objects(self, objs):
-        self.extra_objects.extend(objs)
-
-
-class cy_build_ext(build_ext):
-
-    def _get_egg_name(self):
-        ei_cmd = self.get_finalized_command("egg_info")
-        return Distribution(
-            None, None, ei_cmd.egg_name, ei_cmd.egg_version, get_python_version(),
-            self.distribution.has_ext_modules() and self.plat_name).egg_name()
-
-    def build_extension(self, ext):
-
-        if isinstance(ext, CyExtension) and ext._init_func:
-            ext._init_func(ext)
-
-        if not self.inplace:
-            ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
-
-        if sys.platform == 'darwin':
-            # The idea is to give shared libraries an install name of the form
-            # `@rpath/<library-name.so>`, and to set the rpath equal to
-            # @loader_path. This will allow Python packages to find the library
-            # in the expected place, while still giving enough flexibility to
-            # external applications to link against the library.
-            relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO'))
-            library_path = os.path.join(
-                "@rpath", os.path.basename(relative_module_path)
-            )
-
-            if not ext.extra_link_args:
-                ext.extra_link_args = []
-            ext.extra_link_args += ['-dynamiclib',
-                                    '-rpath', '@loader_path',
-                                    '-Wl,-headerpad_max_install_names',
-                                    '-Wl,-install_name,%s' % library_path,
-                                    '-Wl,-x']
-        else:
-            if not ext.extra_link_args:
-                ext.extra_link_args = []
-
-            ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
-
-        if isinstance(ext, CyExtension) and ext._prebuild_func:
-            ext._prebuild_func(ext, self.force)
-
-        build_ext.build_extension(self, ext)
diff --git a/devtools/import.py b/devtools/import.py

index 90194d0d4704dfeb339b5b885cddf73f6364b4e6..a4652f44f8184f8bb3ff6cecfe17633bb1f08e1d 100644 (file)
--- a/devtools/import.py
+++ b/devtools/import.py
@@ -37,7 +37,7 @@ EXCLUDE = {
      "htslib": (
          'htslib/tabix.c', 'htslib/bgzip.c',
          'htslib/htsfile.c',
-        "test", "tests"),
+        "samples", "test", "tests"),
  }
  
  
diff --git a/devtools/install-prerequisites.sh b/devtools/install-prerequisites.sh

new file mode 100755 (executable)

index 0000000..eaedce1
--- /dev/null
+++ b/devtools/install-prerequisites.sh
@@ -0,0 +1,34 @@
+#!/bin/sh -e
+
+if test -x /usr/bin/dnf; then
+    echo Installing prerequisites via dnf...
+    dnf -y install epel-release
+    dnf -y install zlib-devel bzip2-devel xz-devel curl-devel samtools bcftools htslib-tools
+
+elif test -x /usr/bin/yum; then
+    if yum -y install epel-release; then
+        echo Installing prerequisites via yum...
+        yum -y install zlib-devel bzip2-devel xz-devel curl-devel samtools bcftools htslib-tools
+    else
+        echo Installing non-test prerequisites via yum...
+        yum -y install zlib-devel bzip2-devel xz-devel curl-devel
+    fi
+
+elif test -d /etc/dpkg; then
+    echo Installing prerequisites via apt-get...
+    apt-get update
+    apt-get install -y --no-install-recommends --no-install-suggests libcurl4-openssl-dev zlib1g-dev libbz2-dev liblzma-dev samtools bcftools tabix
+
+elif test -x /sbin/apk; then
+    echo Installing non-test prerequisites via apk...
+    apk update
+    apk add zlib-dev bzip2-dev xz-dev curl-dev
+
+elif test -x ${HOMEBREW_PREFIX-/usr/local}/bin/brew; then
+    echo Installing prerequisites via brew...
+    HOMEBREW_NO_AUTO_UPDATE=1 brew install -q samtools bcftools
+    brew unlink xz || true
+
+else
+    echo No package manager detected
+fi
diff --git a/doc/conf.py b/doc/conf.py

index aaf1d35772baab301c09c52ed41238800c0c74a3..1ada4bc6fd50d776a4a6a18c03abea3f4ec395a3 100644 (file)
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -11,7 +11,7 @@
  # All configuration values have a default; values that are commented out
  # serve to show the default.
  
-import sys, os, setuptools
+import sys, os, re, setuptools
  
  # If extensions (or modules to document with autodoc) are in another directory,
  # add these directories to sys.path here. If the directory is relative to the
@@ -29,6 +29,7 @@ if os.path.exists(_libdir):
  # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
  extensions = ['sphinx.ext.autodoc',
                'sphinx.ext.autosummary',
+              'sphinx.ext.extlinks',
                'sphinx.ext.todo', 
                'sphinx.ext.ifconfig',
                'sphinx.ext.intersphinx',
@@ -50,7 +51,7 @@ master_doc = 'index'
  
  # General information about the project.
  project = u'pysam'
-copyright = u'2009–2021, Andreas Heger, Kevin Jacobs, et al'
+copyright = '2009–2023 Andreas Heger, John Marshall, Kevin Jacobs, et al'
  
  # Included at the end of each rst file
  rst_epilog = '''
@@ -120,12 +121,33 @@ pygments_style = 'sphinx'
  # A list of ignored prefixes for module index sorting.
  #modindex_common_prefix = []
  
+# -- Rewrite "PR #NNN" and "#NNN" in NEWS as URL links -------------------------
+
+extlinks = {
+    'issue': ('https://github.com/pysam-developers/pysam/issues/%s', '#%s'),
+    'pull':  ('https://github.com/pysam-developers/pysam/pull/%s', 'PR #%s'),
+    }
+
+def expand_github_references(text):
+    text = re.sub(r'PR\s*#(\d+)', r':pull:`\1`', text)
+    text = re.sub(r'#(\d+)', r':issue:`\1`', text)
+    return text
+
+def include_read(app, relative_path, parent_docname, source):
+    if relative_path.name == 'NEWS':
+        source[0] = expand_github_references(source[0])
+
+def setup(app):
+    try:
+        app.connect('include-read', include_read)
+    except:
+        pass  # Sphinx is too old to link issues/PRs
  
  # -- Options for HTML output ---------------------------------------------------
  
  # The theme to use for HTML and HTML Help pages.  Major themes that come with
  # Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'default'
+html_theme = 'sphinx_rtd_theme'
  
  # Theme options are theme-specific and customize the look and feel of a theme
  # further.  For a list of options available for each theme, see the
@@ -207,7 +229,7 @@ htmlhelp_basename = 'samtoolsdoc'
  # (source start file, target name, title, author, documentclass [howto/manual]).
  latex_documents = [
      ('index', 'pysam.tex', u'pysam documentation',
-     u'Andreas Heger, Kevin Jacobs, et al.', 'manual'),
+     'Andreas Heger, John Marshall, Kevin Jacobs, et al', 'manual'),
  ]
  
  # The name of an image file (relative to this directory) to place at the top of
diff --git a/doc/index.rst b/doc/index.rst

index 30474e6fd662a8721698caff7b9297980ad01cf9..0b4485ce8f6a331d4adf3550d6a6fd360b90999b 100644 (file)
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -1,7 +1,7 @@
  pysam: htslib interface for python
  ==================================
  
-:Author: Andreas Heger, Kevin Jacobs and contributors
+:Author: Andreas Heger, John Marshall, Kevin Jacobs, and contributors
  :Date: |today|
  :Version: |version|
  
@@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as
  using cython and a high-level, pythonic API for convenient access to
  the data within genomic file formats. 
  
-The current version wraps *htslib-1.17*, *samtools-1.17*, and *bcftools-1.17*.
+The current version wraps *htslib-1.18*, *samtools-1.18*, and *bcftools-1.18*.
  
  To install the latest release, type::
  
@@ -26,6 +26,11 @@ To install the latest release, type::
  
  See the :ref:`Installation notes <installation>` for details.
  
+This module is unrelated to NREL-PySAM_, which wraps the National Renewable
+Energy Laboratory's System Advisor Model.
+
+.. _NREL-PySAM: https://nrel-pysam.readthedocs.io/
+
  Contents
  --------
  
diff --git a/doc/installation.rst b/doc/installation.rst

index a286c2733f44e353c7fd90d70fdf5d3e84b3c919..a659f9da697177b5802b55f1b81e299d57974869 100644 (file)
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -35,7 +35,11 @@ The typical installation will be through pypi_::
  
     pip install pysam
  
-This will compile the ``builtin`` htslib source code within pysam.
+Generally you will have the ``wheel`` package installed and
+this command will speedily install pysam from a pre-built wheel.
+Otherwise, or if you use pip's ``--no-binary`` option, this will
+compile the ``builtin`` htslib source code within pysam and allow
+the configuration facilities described below to be used.
  
  htslib_ can be configured at compilation to turn on additional
  features such as support using encrypted configurations, enable plugins,
diff --git a/doc/release.rst b/doc/release.rst

index d731c36cd8c908103d54c582ffb739f856fdd95b..0676a275331b894780eaa832f4a01e6b01c0ff27 100644 (file)
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,840 +2,4 @@
  Release notes
  =============
  
-Release 0.19.1
-==============
-
-This release wraps htslib/samtools/bcftools version 1.15.1.
-
-* [#1104] add an add_samples() method to quickly add multiple samples
-  to VCF.
-
-Release 0.19.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.15.
-
-* [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands
-
-* [#1078] Support BAM_CPAD in get_aligned_pairs
-
-* [#1063] Run flake8 and fix some linting issues
-
-* [#1088] Add AlignedSegment is_mapped/mate_is_mapped/is_forward/mate_is_forward properties
-
-* Write an absent AlignedSegment.qual as all-bytes-0xff
-
-* Fix BGZFile.read() behaviour near or at EOF
-
-* First API for the htslib modified bases interface
-  
-Release 0.18.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.14.
-
-* [#1048] and [#1060], clarify documentation of index statistics with CRAM files
-* Prevent "retval may be used uninitialised" warning.
-* Add new "samples" subcommand to pysam/samtools.py
-* Introduce TupleProxyIterator iterator object class
-
-Release 0.17.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.13. Corresponding
-to new samtools commands, `pysam.samtools` now has additional functions
-`ampliconclip`, `ampliconstats`, `fqimport`, and `version`.
-
-Bugs fixed:
-
-* [#447] The maximum QNAME length is fully restored to 254
-* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors
-* [#603] count_coverage: ignore reads that have no SEQ field
-* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault
-* [#983] Add win32/\*.[ch] to MANIFEST.in
-* [#994] Raise exception in ``get_tid()`` if header could not be parsed
-* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi
-* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases
-* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax
-* [#1035] Improved handling of file iteration errors
-* [#1038] ``tabix_index()`` no longer leaks file descriptors
-* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value
-  (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based)
-* *setup.py* no longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0
-
-New facilities:
-
-* [PR #963] Additional VCF classes are exposed to pysam programmers
-* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion
-* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize
-* Running tests with ``pytest`` now automatically runs ``make`` to generate test data
-
-Documentation improvements:
-
-* [#726] Clarify get_forward_sequence/get_forward_qualities documentation
-* [#865] Improved example
-* [#968] ``get_index_statistics`` parameters
-* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open.
-* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation
-* [#999] Fix documentation for ``AlignmentFile.get_reference_length()``
-* [#1002] Document the default min_base_quality for ``pileup()``
-
-
-Release 0.16.0
-==============
-
-This release wraps htslib/bcftools version 1.10.2 and samtools version
-1.10. The following bugs reported against pysam are fixed due to this:
-
-* [#447] Writing out QNAME longer than 251 characters corrupts BAM
-* [#640, #734, #843] Setting VariantRecord pos or stop raises error
-* [#738, #919] FastxFile truncates concatenated plain gzip compressed files
-
-Additional bugfixes:
-
-* [#840] Pileup doesn't work on python3 when `index_filename` is used
-* [#886] FastqProxy raises ValueError when instantiated from python
-* [#904] VariantFile.fetch() throws ValueError on files with no records
-* [#909] Fix incorrect quoting in VariantFile contig records
-* [#915, #916] Implement pileup() for unindexed files and/or SAM files
-
-Backwards incompatible changes:
-
-* The `samtools import` command was removed in samtools 1.10, so pysam
-  no longer exports a `samimport` function. Use `pysam.view()` instead.
-
-
-Release 0.15.4
-==============
-
-Bugfix release. Principal reason for release is to update cython
-version in order to fix pip install pysam with python 3.8.
-
-* [#879] Fix add_meta function in libcbcf.pyx, so meta-information
-  lines in header added with this function have double-quoting rules
-  in accordance to rules specified in VCF4.2 and VCF4.3 specifications
-* [#863] Force arg to bytes to support non-ASCII encoding
-* [#875] Bump minimum Cython version
-* [#868] Prevent segfault on Python 2.7 AlignedSegment.compare(other=None)
-* [#867] Fix wheel building on TravisCI
-* [#863] Force arg to bytes to support non-ASCII encoding
-* [#799] disambiguate interpretation of bcf_read return code
-* [#841] Fix silent truncation of FASTQ with bad q strings
-* [#846] Prevent segmentation fault on ID, when handling malformed records
-* [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars
-
-
-Release 0.15.3
-==============
-
-Bugfix release.
-
-* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
-* [#780] close all filehandles before opening new ones in pysam_dispatch
-* [#773] do not cache VariantRecord.id to avoid memory leak
-* [#781] default of multiple_iterators=True is changed to False for
-  CRAM files.
-* [#825] fix collections.abc import
-* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
-  bug when setting FORMAT fields.
-* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
-  memory as needed, rather than a fixed-size char buffer.
-* [#814] Build wheels and upload them to PyPI
-* [#755] Allow passing flags and arguments to index methods
-* [#763] Strip \0 in header check
-* [#761] Test Tabix index contents, not the compression
-
-Release 0.15.2
-==============
-
-Bugfix release.
-
-* [#746] catch pileup iterator out-of-scope segfaults
-* [#747] fix faidx fetch with region
-* [#748] increase max_pos to (1<<31)-1
-* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA
-* [#737] Fix bug in get_aligned_pairs, @bkohrn
-
-Release 0.15.1
-==============
-
-Bugfix release.
-
-* [#716] raise ValueError if tid is out of range when writing
-* [#697] release version using cython 0.28.5 for python 3.7
-  compatibility
-
-Release 0.15.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.9.0.
-
-* [#673] permit dash in chromosome name of region string
-* [#656] Support `text` when opening a SAM file for writing
-* [#658] return None in get_forward_sequence if sequence not in record
-* [#683] allow lower case bases in MD tags
-* Ensure that = and X CIGAR ops are treated the same as M
-
-Release 0.14.1
-==============
-
-This is mostly a bugfix release, though bcftools has now also been
-upgraded to 1.7.0.
-
-* [#621] Add a warning to count_coverage when an alignment has an
-  empty QUAL field
-* [#635] Speed-up of AlignedSegment.find_intro()
-* treat border case of all bases in pileup column below quality score
-* [#634] Fix access to pileup reference_sequence
-
-
-Release 0.14.0
-==============
-
-This release wraps htslib/samtools versions 1.7.0.
-
-* SAM/BAM/CRAM headers are now managed by a separate AlignmentHeader
-  class.
-* AlignmentFile.header.as_dict() returns an ordered dictionary.
-* Use "stop" instead of "end" to ensure consistency to
-  VariantFile. The end designations have been kept for backwards
-  compatibility.
-
-* [#611] and [#293] CRAM repeated fetch now works, each iterator
-  reloads index if multiple_iterators=True
-* [#608] pysam now wraps htslib 1.7 and samtools 1.7.
-* [#580] reference_name and next_reference_name can now be set to "*"
-  (will be converted to None to indicate an unmapped location)
-* [#302] providing no coordinate to count_coverage will not count from
-  start/end of contig.
-* [#325] @SQ records will be automatically added to header if they are
-  absent from text section of header.
-* [#529] add get_forward_sequence() and get_forward_qualities()
-  methods
-* [#577] add from_string() and to_dict()/from_dict() methods to
-  AlignedSegment. Rename tostring() to to_string() throughout for
-  consistency
-* [#589] return None from build_alignment_sequence if no MD tag is set
-* [#528] add PileupColumn.__len__ method
-
-Backwards incompatible changes:
-
-* AlignmentFile.header now returns an AlignmentHeader object. Use
-  AlignmentFile.header.to_dict() to get the dictionary as
-  previously. Most dictionary accessor methods (keys(), values(),
-  __getitem__, ...) have been implemented to ensure some level of
-  backwards compatibility when only reading.
-
-  The rationale for this change is to have consistency between
-  AlignmentFile and VariantFile.
-
-* AlignmentFile and FastaFile now raise IOError instead of OSError
-
-Medium term we plan to have a 1.0 release. The pysam
-interface has grown over the years and the API is cluttered with
-deprecated names (Samfile, getrname(), gettid(), ...). To work towards
-this, the next release (0.15.0) will yield DeprecationWarnings 
-for any parts of the API that are considered obsolete and will not be
-in 1.0. Once 1.0 has been reached, we will use semantic versioning.
-
-Release 0.13.0
-===============
-
-This release wraps htslib/samtools/bcftools versions 1.6.0 and
-contains a series of bugfixes.
-
-* [#544] reading header from remote TabixFiles now works.
-* [#531] add missing tag types H and A. A python float will now be
-  added as 'f' type instead of 'd' type.
-* [#543] use FastaFile instead of Fastafile in pileup.
-* [#546] set is_modified flag in setAttribute so updated attributes
-  are output.
-* [#537] allow tabix index files to be created in a custom location.
-* [#530] add get_index_statistics() method
-
-
-Release 0.12.0.1
-================
-
-Bugfix release to solve compilation issue due to missing
-bcftools/config.h file.
-
-Release 0.12.0
-==============
-
-This release wraps htslib/samtools/bcftools versions 1.5.0 and
-contains a series of bugfixes.
-
-* [#473] A new FastxRecord class that can be instantiated from class and
-  modified in-place. Replaces PersistentFastqProxy.
-* [#521] In AlignmentFile, simplify file detection logic and allow remote index files
-
-  * Removed attempts to guess data and index file names; this is magic left
-    to htslib.
-  * Removed file existence check prior to opening files with htslib
-  * Better error checking after opening files that raise the appropriate
-    error (IOError for when errno is set, ValueError otherwise for backward
-    compatibility).
-  * Report IO errors when loading an index by name.
-  * Allow remote indices (tested using S3 signed URLs).
-  * Document filepath_index and make it an alias for index_filename.
-  * Added a require_index parameter to AlignmentFile
-
-* [#526] handle unset ref when creating new records
-* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
-  segfaults
-* [#516] expose IO errors via IOError exceptions
-* [#487] add tabix line_skip, remove 'pileup' preset
-* add FastxRecord, replaces PersistentFastqProxy (still present for
-  backwards compatibility)
-* [#496] upgrade to htslib/samtools/bcftools versions 1.5
-* add start/stop to AlignmentFile.fetch() to be consistent with
-  VariantFile.fetch(). "end" is kept for backwards compatibility.
-* [#512] add get_index_statistics() method to AlignmentFile.
-
-Upcoming changes:
-
-In the next release we are planning to separate the header information
-from AlignmentFile into a separate class AlignmentHeader. This layout
-is similar to VariantFile/VariantHeader. With this change we will
-ensure that an AlignedSegment record will be linked to a header so
-that chromosome names can be automatically translated from the numeric
-representation. As a consequence, the way new AlignedSegment records
-are created will need to change as the constructor requires a header::
-
-    header = pysam.AlignmentHeader(
-        reference_names=["chr1", "chr2"],
-        reference_lengths=[1000, 1000])
-
-    read = pysam.AlignedSegment(header)
-
-This will affect all code that instantiates AlignedSegment objects
-directly. We have not yet merged to allow users to provide feed-back.
-The pull-request is here: https://github.com/pysam-developers/pysam/pull/518
-Please comment on github.
-
-Release 0.11.2.2
-================
-
-Bugfix release to address two issues:
-
-* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and
-  more tests have been added.
-* [#479] Correct VariantRecord edge cases described in issue
-
-Release 0.11.2.1
-================
-
-Release to fix release tar-ball containing 0.11.1 pre-compiled
-C-files.
-
-Release 0.11.2
-==============
-
-This release wraps htslib/samtools/bcftools versions 1.4.1 in response
-to a security fix in these libraries. Additionally the following
-issues have been fixed:
-
-* [#452] add GFF3 support for tabix parsers
-* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END
-* [#447] limit query name to 251 characters (only partially addresses issue)
-
-VariantFile and related object fixes
-
-* Restore VariantFile.\_\_dealloc\_\_
-* Correct handling of bcf_str_missing in bcf_array_to_object and
-  bcf_object_to_array
-* Added update() and pop() methods to some dict-like proxy objects
-* scalar INFO entries could not be set again after being deleted
-* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without
-  raising a KeyError
-* Multiple other fixes for VariantRecordInfo methods
-* INFO/END is now accessible only via VariantRecord.stop and
-  VariantRecord.rlen.  Even if present behind the scenes, it is no longer
-  accessible via VariantRecordInfo.
-* Add argument to issue a warning instead of an exception if input appears
-  to be truncated
-
-Other features and fixes:
-
-* Make AlignmentFile \_\_dealloc\_\_ and close more
-  stringent
-* Add argument AlignmentFile to issue a warning instead of an
-  exception if input appears to be truncated
-
-Release 0.11.1
-==============
-
-Bugfix release
-
-* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility.
-
-Release 0.11.0
-==============
-
-This release wraps the latest versions of htslib/samtools/bcftools and
-implements a few bugfixes.
-
-* [#413] Wrap HTSlib/Samtools/BCFtools 1.4 
-* [#422] Fix missing pysam.sort.usage() message
-* [#411] Fix BGZfile initialization bug
-* [#412] Add seek support for BGZFile
-* [#395] Make BGZfile iterable
-* [#433] Correct getQueryEnd
-* [#419] Export SAM enums such as pysam.CMATCH
-* [#415] Fix access by tid in AlignmentFile.fetch()
-* [#405] Writing SAM now outputs a header by default.
-* [#332] split infer_query_length(always) into infer_query_length and infer_read_length
-
-Release 0.10.0
-==============
-
-This release implements further functionality in the VariantFile API
-and includes several bugfixes:
-
-* treat special case -c option in samtools view outputs to stdout even
-  if -o given, fixes #315
-* permit reading BAM files with CSI index, closes #370
-* raise Error if query name exceeds maximum length, fixes #373
-* new method to compute hash value for AlignedSegment
-* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
-* Avoid segfault by detecting out of range reference_id and
-  next_reference in AlignedSegment.tostring
-* Issue #355: Implement streams using file descriptors for VariantFile
-* upgrade to htslib 1.3.2
-* fix compilation with musl libc
-* Issue #316, #360: Rename all Cython modules to have lib as a prefix
-* Issue #332, hardclipped bases in cigar included by
-  pysam.AlignedSegment.infer_query_length()
-* Added support for Python 3.6 filename encoding protocol
-* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord
-* Issue #331, fix failure in VariantFile.reset() method
-* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and
-  VariantRecord.copy() methods to create new VariantRecord objects
-* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters
-* Preliminary (potentially unsafe) support for removing and altering header metadata
-* Many minor fixes and improvements to VariantFile and related objects
-
-Please note that all internal cython extensions now have a lib prefix
-to facilitate linking against pysam extension modules. Any user cython
-extensions using cimport to import pysam definitions will need
-changes, for example::
-
-   cimport pysam.csamtools
-
-will become::
-
-   cimport pysam.libcsamtools
-
-Release 0.9.1
-=============
-
-This is a bugfix release addressing some installation problems
-in pysam 0.9.0, in particular:
-
-* patch included htslib to work with older libcurl versions, fixes #262.
-* do not require cython for python 3 install, fixes #260
-* FastaFile does not accept filepath_index any more, see #270
-* add AlignedSegment.get_cigar_stats method.
-* py3 bugfix in VariantFile.subset_samples, fixes #272
-* add missing sysconfig import, fixes #278
-* do not redirect stdout, but instead write to a separately
-  created file. This should resolve issues when pysam is used
-  in notebooks or other environments that redirect stdout.
-* wrap htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1
-* use bgzf throughout instead of gzip
-* allow specifying a fasta reference for CRAM file when opening
-  for both read and write, fixes #280
-
-Release 0.9.0
-=============
-
-Overview
---------
-
-The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
-enhancements and bugfixes. See below for a detailed list.
-
-`Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
-comes with additional capabilities for remote file access which depend
-on the presence of optional system libraries. As a consequence, the
-installation script :file:`setup.py` has become more complex. For an
-overview, see :ref:`installation`.  We have tested installation on
-linux and OS X, but could not capture all variations. It is possible
-that a 0.9.1 release might follow soon addressing installation issues.
-
-The :py:class:`~.pysam.VariantFile` class provides access to
-:term:`vcf` and :term:`bcf` formatted files. The class is certainly
-usable and interface is reaching completion, but the API and the
-functionality is subject to change.
-
-Detailed release notes
-----------------------
-
-* upgrade to htslib 1.3
-* python 3 compatibility tested throughout.
-* added a first set of bcftools commands in the pysam.bcftools
-  submodule.
-* samtools commands are now in the pysam.samtools module. For
-  backwards compatibility they are still imported into the pysam
-  namespace.
-* samtools/bcftools return stdout as a single (byte) string. As output
-  can be binary (VCF.gz, BAM) this is necessary to ensure py2/py3
-  compatibility. To replicate the previous behaviour in py2.7, use::
-
-     pysam.samtools.view(self.filename).splitlines(True)
-
-* get_tags() returns the tag type as a character, not an integer (#214)
-* TabixFile now raises ValueError on indices created by tabix <1.0 (#206)
-* improve OSX installation and develop mode
-* FastxIterator now handles empty sequences (#204)
-* TabixFile.isremote is now TabixFile.is_remote in line with AlignmentFile
-* AlignmentFile.count() has extra optional argument read_callback
-* setup.py has been changed to:
-   * install a single builtin htslib library. Previously, each pysam
-     module contained its own version. This reduces compilation time
-     and code bloat.
-   * run configure for the builtin htslib library in order to detect
-     optional libraries such as libcurl. Configure behaviour can be
-     controlled by setting the environment variable
-     HTSLIB_CONFIGURE_OPTIONS.
-* get_reference_sequence() now returns the reference sequence and not
-  something looking like it. This bug had effects on
-  get_aligned_pairs(with_seq=True), see #225. If you have relied on
-  get_aligned_pairs(with_seq=True) in pysam-0.8.4, please check your
-  results.
-* improved autodetection of file formats in AlignmentFile and VariantFile.
-
-Release 0.8.4
-=============
-
-This release contains numerous bugfixes and a first implementation of
-a pythonic interface to VCF/BCF files. Note that this code is still
-incomplete and preliminary, but does offer a nearly complete immutable
-Pythonic interface to VCF/BCF metadata and data with reading and
-writing capability.
-
-Potential issues when upgrading from v0.8.3:
-
-* binary tags are now returned as python arrays
-
-* renamed several methods for pep8 compatibility, old names still retained for 
-  backwards compatibility, but should be considered deprecated.
-
-   * gettid() is now get_tid()
-   * getrname() is now get_reference_name()
-   * parseRegion() is now parse_region()
-
-* some methods have changed for pep8 compatibility without the old
-  names being present:
-
-   * fromQualityString() is now qualitystring_to_array()
-   * toQualityString() is now qualities_to_qualitystring()
-
-* faidx now returns strings and not binary strings in py3.
-
-* The cython components have been broken up into smaller files with
-  more specific content. This will affect users using the cython
-  interfaces.
-
-Edited list of commit log changes:
-
-*    fixes AlignmentFile.check_index to return True
-*    add RG/PM header tag - closes #179
-*    add with_seq option to get_aligned_pairs
-*    use char * inside reconsituteReferenceSequence
-*    add soft clipping for get_reference_sequence
-*    add get_reference_sequence
-*    queryEnd now computes length from cigar string if no sequence present, closes #176
-*    tolerate missing space at end of gtf files, closes #162
-*    do not raise Error when receiving output on stderr
-*    add docu about fetching without index, closes #170
-*    FastaFile and FastxFile now return strings in python3, closes #173
-*    py3 compat: relative -> absolute imports.
-*    add reference_name and next_reference_name attributes to AlignedSegment
-*    add function signatures to cvcf cython.  Added note about other VCF code.
-*    add context manager functions to FastaFile
-*    add reference_name and next_reference_name attributes to AlignedSegment
-*    PileupColumn also gets a reference_name attribute.
-*    add context manager functions to FastaFile
-*    TabixFile.header for remote files raises AttributeError, fixes #157
-*    add context manager interface to TabixFile, closes #165
-*    change ctypedef enum to typedef enum for cython 0.23
-*    add function signatures to cvcf cython, also added note about other VCF code
-*    remove exception for custom upper-case header record tags.
-*    rename VALID_HEADER_FIELDS to KNOWN_HEADER_FIELDS
-*    fix header record tag parsing for custom tags.
-*    use cython.str in count_coverage, fixes #141
-*    avoid maketrans (issues with python3)
-*    refactoring: AlignedSegment now in separate module
-*    do not execute remote tests if URL not available
-*    fix the unmapped count, incl reads with no SQ group
-*    add raw output to tags
-*    added write access for binary tags
-*    bugfix in call to resize
-*    implemented writing of binary tags from arrays
-*    implemented convert_binary_tag to use arrays
-*    add special cases for reads that are unmapped or whose mates are unmapped.
-*    rename TabProxies to ctabixproxies
-*    remove underscores from utility functions
-*    move utility methods into cutils
-*    remove callback argument to fetch - closes #128
-*    avoid calling close in dealloc
-*    add unit tests for File object opening
-*    change AlignmentFile.open to filepath_or_object
-*    implement copy.copy, close #65
-*    add caching of array attributes in AlignedSegment, closes #121
-*    add export of Fastafile
-*    remove superfluous pysam_dispatch
-*    use persist option in FastqFile
-*    get_tag: expose tag type if requested with `with_value_type`
-*    fix to allow reading vcf record info via tabix-based vcf reader
-*    add pFastqProxy and pFastqFile objects to make it possible to work with multiple fastq records per file handle, unlike FastqProxy/FastqFile.
-*    release GIL around htslib IO operations
-*    More work on read/write support, API improvements
-*    add `phased` property on `VariantRecordSample`
-*    add mutable properties to VariantRecord
-*    BCF fixes and start of read/write support
-*    VariantHeaderRecord objects now act like mappings for attributes.
-*    add VariantHeader.alts dict from alt ID->Record.
-*    Bug fix to strong representation of structured header records.
-*    VariantHeader is now mutable
-
-
-Release 0.8.3
-=============
-
-* samtools command now accept the "catch_stdout" option.
-
-* get_aligned_pairs now works for soft-clipped reads.
-
-* query_position is now None when a PileupRead is not aligned
-  to a particular position.
-
-* AlignedSegments are now comparable and hashable.
-
-Release 0.8.2.1
-===============
-
-* Installation bugfix release.
-
-Release 0.8.2
-=============
-
-* Pysam now wraps htslib 1.2.1 and samtools version 1.2.
-
-* Added CRAM file support to pysam.
-
-* New alignment info interface.
-   * opt() and setTag are deprecated, use get_tag() and set_tag()
-     instead.
-   * added has_tag()
-   * tags is deprecated, use get_tags() and set_tags() instead.
-
-* FastqFile is now FastxFile to reflect that the latter permits
-  iteration over both fastq- and fasta-formatted files.
-
-* A Cython wrapper for htslib VCF/BCF reader/writer. The wrapper
-  provides a nearly complete Pythonic interface to VCF/BCF metadata
-  with reading and writing capability. However, the interface is still
-  incomplete and preliminary and lacks capability to mutate the
-  resulting data.
-
-Release 0.8.1
-=============
-
-* Pysam now wraps htslib and samtools versions 1.1.
-
-* Bugfixes, most notable:
-  * issue #43: uncompressed BAM output
-  * issue #42: skip tests requiring network if none available
-  * issue #19: multiple iterators can now be made to work on the same tabix file
-  * issue #24: All strings returned from/passed to the pysam API are now unicode in python 3
-  * issue #5:  type guessing for lists of integers fixed    
-
-* API changes for consistency. The old API is still present,
-  but deprecated.
-  In particular:
-
-  * Tabixfile -> TabixFile
-  * Fastafile -> FastaFile
-  * Fastqfile -> FastqFile
-  * Samfile -> AlignmentFile
-  * AlignedRead -> AlignedSegment
-     * qname -> query_name
-     * tid -> reference_id
-     * pos -> reference_start
-     * mapq -> mapping_quality
-     * rnext -> next_reference_id
-     * pnext -> next_reference_start
-     * cigar -> cigartuples
-     * cigarstring -> cigarstring
-     * tlen -> template_length
-     * seq -> query_sequence
-     * qual -> query_qualities, now returns array
-     * qqual -> query_alignment_qualities, now returns array
-     * tags -> tags
-     * alen -> reference_length, reference is always "alignment", so removed
-     * aend -> reference_end
-     * rlen -> query_length
-     * query -> query_alignment_sequence
-     * qstart -> query_alignment_start
-     * qend -> query_alignment_end
-     * qlen -> query_alignment_length
-     * mrnm -> next_reference_id   
-     * mpos -> next_reference_start
-     * rname -> reference_id
-     * isize -> template_length
-     * blocks -> get_blocks()
-     * aligned_pairs -> get_aligned_pairs()
-     * inferred_length -> infer_query_length()
-     * positions -> get_reference_positions()
-     * overlap() -> get_overlap()
-
-  * All strings are now passed to or received from the pysam API
-    as strings, no more bytes.
-
-Other changes:
-   * AlignmentFile.fetch(reopen) option is now multiple_iterators. The
-     default changed to not reopen a file unless requested by the user.
-   * FastaFile.getReferenceLength is now FastaFile.get_reference_length
-
-Backwards incompatible changes
-
-* Empty cigarstring now returns None (instead of '')
-* Empty cigar now returns None (instead of [])
-* When using the extension classes in cython modules, AlignedRead
-  needs to be substituted with AlignedSegment. 
-* fancy_str() has been removed
-* qual, qqual now return arrays
-
-Release 0.8.0
-=============
-
-* Disabled features
-   * IteratorColumn.setMask() disabled as htslib does not implement
-     this functionality?
-
-* Not implemented yet:
-   * reading SAM files without header
-
-Tabix files between version 0.7.8 and 0.8.0 are
-not compatible and need to be re-indexed.
-
-While version 0.7.8 and 0.8.0 should be mostly
-compatible, there are some notable exceptions:
-
-* tabix iterators will fail if there are comments
-  in the middle or the end of a file.
-
-* tabix always raises ValueError for invalid intervals.
-  Previously, different types of errors were raised
-  (KeyError, IndexError, ValueError) depending on
-  the type of invalid intervals (missing chromosome,
-  out-of-range, malformed interval).
-
-
-Release 0.7.8
-=============
-
-* added AlignedRead.setTag method
-* added AlignedRead.blocks
-* unsetting CIGAR strings is now possible
-* empty CIGAR string returns empty list
-* added reopen flag to Samfile.fetch()
-* various bugfixes
-
-Release 0.7.7
-=============
-
-* added Fastafile.references, .nreferences and .lengths
-* tabix_iterator now uses kseq.h for python 2.7
-
-Release 0.7.6
-=============
-
-* added inferred_length property
-* issue 122: MACOSX getline missing, now it works?
-* seq and qual can be set None
-* added Fastqfile
-
-Release 0.7.5
-=============
-
-* switch to samtools 0.1.19
-* issue 122: MACOSX getline missing
-* issue 130: clean up tempfiles
-* various other bugfixes
-
-Release 0.7.4
-=============
-
-* further bugfixes to setup.py and package layout
-
-Release 0.7.3
-=============
-
-* further bugfixes to setup.py
-* upgraded distribute_setup.py to 0.6.34
-
-Release 0.7.2
-=============
-
-* bugfix in installer - failed when cython not present
-* changed installation locations of shared libraries
-
-Release 0.7.1
-=============
-
-* bugfix: missing PP tag PG records in header
-* added pre-built .c files to distribution
-
-Release 0.7
-===========
-
-* switch to tabix 0.2.6
-* added cigarstring field
-* python3 compatibility
-* added B tag handling
-* added check_sq and check_header options to Samfile.__init__
-* added lazy GTF parsing to tabix
-* reworked support for VCF format parsing
-* bugfixes
-
-Release 0.6
-===========
-
-* switch to samtools 0.1.18
-* various bugfixes
-* removed references to deprecated 'samtools pileup' functionality
-* AlignedRead.tags now returns an empty list if there are no tags.
-* added pnext, rnext and tlen
-
-Release 0.5
-===========
-
-* switch to samtools 0.1.16 and tabix 0.2.5
-* improved tabix parsing, added vcf support
-* re-organized code to permit linking against pysam
-* various bugfixes
-* added Samfile.positions and Samfile.overlap
-
-Release 0.4
-===========
-
-* switch to samtools 0.1.12a and tabix 0.2.3
-* added snp and indel calling.
-* switch from pyrex to cython
-* changed handling of samtools stderr
-* various bugfixes
-* added Samfile.count and Samfile.mate
-* deprecated AlignedRead.rname, added AlignedRead.tid
-
-Release 0.3
-===========
-
-* switch to samtools 0.1.8
-* added support for tabix files
-* numerous bugfixes including
-* permit simultaneous iterators on the same file
-* working access to remote files
+.. include:: ../NEWS
diff --git a/doc/requirements-rtd.txt b/doc/requirements-rtd.txt

new file mode 100644 (file)

index 0000000..beb55b6
--- /dev/null
+++ b/doc/requirements-rtd.txt
@@ -0,0 +1,2 @@
+sphinx==7.2.5
+sphinx-rtd-theme==1.3.0
diff --git a/pyproject.toml b/pyproject.toml

index 41067835ab13c323596f52bcc13baae3e8cbe7a8..1f89f9b2bc4589cd8a5b49cfd5b8cc67bdff4025 100644 (file)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,7 @@
  [project]
  name = "pysam"
-description = "pysam - a python module for reading, manipulating and writing genomic data sets."
+description = "Package for reading, manipulating, and writing genomic data"
  license = { text = "MIT License" }
-version = "0.21.0"
  authors = [
   { name = "Andreas Heger", email = "andreas.heger@gmail.com"}
  ]
@@ -11,13 +10,32 @@ requires-python = ">=3.6"
  dynamic = [
      "classifiers",
      "readme",
+    "version",
  ]
  
-dependencies = [
-    "cython",
-]
-
+[project.urls]
+"Documentation" = "https://pysam.readthedocs.io/"
+"Release notes" = "https://pysam.readthedocs.io/en/stable/release.html"
  
  [build-system]
-requires = ["setuptools>=59.0", "wheel", "Cython>=0.29.30,<3.0"]
+requires = ["setuptools>=59.0", "Cython>=0.29.12,<4"]
  build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.cibuildwheel]
+before-all = "{project}/devtools/install-prerequisites.sh"
+# Necessary until we build libhts.a out-of-tree from within build_temp
+before-build = "make -C {project}/htslib distclean"
+
+test-requires = ["pytest"]
+test-command = "REF_PATH=: pytest {project}/tests"
+
+[tool.tox]
+legacy_tox_ini = """
+    [tox]
+    envlist = py36, py311
+
+    [testenv]
+    deps = pytest
+    setenv = REF_PATH=:
+    commands = pytest tests
+"""
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx

index 75b5ee92dbaef27ccc9f0e1f6a1eb2c9e026a342..3071f3753ad0e00c78022b28c944a3974cccb511 100644 (file)
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -757,7 +757,18 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
          elif op == BAM_CHARD_CLIP:
              pass # advances neither
  
-    cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
+    cdef char *md_tag, md_buffer[2];
+    cdef uint8_t md_typecode = md_tag_ptr[0]
+    if md_typecode == b'Z':
+        md_tag = bam_aux2Z(md_tag_ptr)
+    elif md_typecode == b'A':
+        # Work around HTSeq bug that writes 1-character strings as MD:A:v
+        md_buffer[0] = bam_aux2A(md_tag_ptr)
+        md_buffer[1] = b'\0'
+        md_tag = md_buffer
+    else:
+        raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
+
      cdef int md_idx = 0
      cdef char c
      s_idx = 0
@@ -1083,7 +1094,10 @@ cdef class AlignedSegment:
          _sam = force_bytes(sam)
          line.s = _sam
  
-        sam_parse1(&line, dest.header.ptr, dest._delegate)
+        cdef int ret
+        ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
+        if ret < 0:
+            raise ValueError("parsing SAM record string failed (error code {})".format(ret))
  
          return dest
  
@@ -1845,12 +1859,16 @@ cdef class AlignedSegment:
      def get_reference_positions(self, full_length=False):
          """a list of reference positions that this read aligns to.
  
-        By default, this method only returns positions in the
-        reference that are within the alignment. If *full_length* is
-        set, None values will be included for any soft-clipped or
-        unaligned positions within the read. The returned list will
-        thus be of the same length as the read.
+        By default, this method returns the (0-based) positions on the
+        reference that are within the read's alignment, leaving gaps
+        corresponding to deletions and other reference skips.
  
+        When *full_length* is True, the returned list is the same length
+        as the read and additionally includes None values corresponding
+        to insertions or soft-clipping, i.e., to bases of the read that
+        are not aligned to a reference position.
+        (See also :meth:`get_aligned_pairs` which additionally returns
+        the corresponding positions along the read.)
          """
          cdef uint32_t k, i, l, pos
          cdef int op
@@ -1958,6 +1976,10 @@ cdef class AlignedSegment:
      def get_aligned_pairs(self, matches_only=False, with_seq=False):
          """a list of aligned read (query) and reference positions.
  
+        Each item in the returned list is a tuple consisting of
+        the 0-based offset from the start of the read sequence
+        followed by the 0-based reference position.
+
          For inserts, deletions, skipping either query or reference
          position may be None.
  
@@ -1968,7 +1990,7 @@ cdef class AlignedSegment:
          ----------
  
          matches_only : bool
-          If True, only matched bases are returned - no None on either
+          If True, only matched bases are returned --- no None on either
            side.
          with_seq : bool
            If True, return a third element in the tuple containing the
diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi

index 74637f82fa72e491013b62fb25addc83734039ae..28b395aed078cb76af6ef33e3e5df0582ea3673d 100644 (file)
--- a/pysam/libcalignmentfile.pyi
+++ b/pysam/libcalignmentfile.pyi
@@ -71,6 +71,10 @@ class AlignmentHeader:
      def is_valid_tid(self, tid: int) -> bool: ...
      def get_tid(self, reference: str) -> int: ...
  
+# The iterator produced by AlignmentFile is currently itself, but this may
+# change in future and code should not make assumptions about this type.
+AlignmentFileIterator = AlignmentFile
+
  class AlignmentFile(HTSFile):
      def __init__(
          self,
@@ -172,8 +176,8 @@ class AlignmentFile(HTSFile):
      @property
      def nocoordinate(self) -> int: ...
      def get_index_statistics(self) -> List[IndexStats]: ...
-    def __iter__(self) -> Any: ...
-    def __next__(self) -> Any: ...
+    def __iter__(self) -> AlignmentFileIterator: ...
+    def __next__(self) -> AlignedSegment: ...
      def is_valid_tid(self, tid: int) -> bool: ...
      def get_tid(self, reference: str) -> int: ...
      def get_reference_name(self, tid: int) -> str: ...
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx

index e37a411cf7b538753865b8eecb66929025017066..97d4e6d592fd1e000e22e1cb4d452b5815900f59 100644 (file)
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -73,7 +73,8 @@ from cpython cimport array as c_array
  from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
  from pysam.libcutils cimport encode_filename, from_string_and_size
  from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
-from pysam.libchtslib cimport HTSFile, hisremote
+from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
+                              HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
  
  from io import StringIO
  
@@ -1005,7 +1006,8 @@ cdef class AlignmentFile(HTSFile):
  
                  if cfilename or cindexname:
                      with nogil:
-                        self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
+                        self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
+                                                     HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)
  
                      if not self.index and (cindexname or require_index):
                          if errno:
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx

index 8c088af278453ad4339f1db7d7f7b5d296782226..8ecfe5f3888e7d02ad5fa1400b625debc471871a 100644 (file)
--- a/pysam/libcbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -3479,7 +3479,7 @@ cdef class VariantRecordSample(object):
          return bcf_format_get_alleles(self)
  
      @alleles.setter
-    def alleles(self, value: tuple):
+    def alleles(self, value):
          # Sets the genotype, supply a tuple of alleles to set.
          # The supplied alleles need to be defined in the corresponding pysam.libcbcf.VariantRecord
          # The genotype is reset when an empty tuple, None or (None,) is supplied
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx

index 0d88f8d918a9c4580c7892acd519cfccc6aa5f67..d66a3c6125b8e9267df1cf2e17a815ce02a40aee 100644 (file)
--- a/pysam/libcbgzf.pyx
+++ b/pysam/libcbgzf.pyx
@@ -10,6 +10,7 @@ import io
  
  from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
  from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdio  cimport SEEK_SET
  from libc.stdlib cimport malloc, calloc, realloc, free
  
  from cpython.object cimport PyObject
@@ -18,7 +19,7 @@ from cpython.bytes  cimport PyBytes_FromStringAndSize, _PyBytes_Resize
  from pysam.libcutils   cimport force_bytes, encode_filename
  from pysam.libchtslib  cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
                                 bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
-                               bgzf_tell, bgzf_getline, kstring_t, SEEK_SET, BGZF
+                               bgzf_tell, bgzf_getline, kstring_t, BGZF
  
  __all__ = ["BGZFile"]
  
diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd

index 30a1b76191bccbcf0797f6718f57b2321dc4d3d9..56e746074eb4c691b3c8ce5c54daa9bb227780c0 100644 (file)
--- a/pysam/libchtslib.pxd
+++ b/pysam/libchtslib.pxd
@@ -273,8 +273,6 @@ cdef extern from "htslib/bgzf.h" nogil:
      #  Write the data in the buffer to the file.
      int bgzf_flush(BGZF *fp)
  
-    int SEEK_SET
-
      #  Return a virtual file pointer to the current location in the file.
      #  No interpretation of the value should be made, other than a subsequent
      #  call to bgzf_seek can be used to position the file at the same point.
@@ -285,7 +283,7 @@ cdef extern from "htslib/bgzf.h" nogil:
      #
      #  @param fp     BGZF file handler
      #  @param pos    virtual file offset returned by bgzf_tell()
-    #  @param whence must be SEEK_SET
+    #  @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd)
      #  @return       0 on success and -1 on error
      # /
      int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
@@ -362,7 +360,7 @@ cdef extern from "htslib/bgzf.h" nogil:
      #
      #   @param fp           BGZF file handler; must be opened for reading
      #   @param uoffset      file offset in the uncompressed data
-    #   @param where        SEEK_SET supported atm
+    #   @param where        SEEK_SET (cimported from libc.stdio) supported atm
      #
      #   Returns 0 on success and -1 on error.
      int bgzf_useek(BGZF *fp, long uoffset, int where)
@@ -688,6 +686,17 @@ cdef extern from "htslib/hts.h" nogil:
      #    @return  The index, or NULL if an error occurred.
      hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
  
+    #### Load a specific index file
+    #    @param fn     Input BAM/BCF/etc filename
+    #    @param fnidx  The input index filename
+    #    @param fmt    One of the HTS_FMT_* index formats
+    #    @param flags  Flags to alter behaviour (HTS_IDX_SAVE_REMOTE and/or HTS_IDX_SILENT_FAIL)
+    #    @return  The index, or NULL if an error occurred.
+    hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
+
+    int HTS_IDX_SAVE_REMOTE
+    int HTS_IDX_SILENT_FAIL
+
      uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
      void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
  
@@ -1092,6 +1101,14 @@ cdef extern from "htslib/sam.h" nogil:
      # @return  The index, or NULL if an error occurred.
      hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
  
+    # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file
+    # @param fp     File handle of the data file whose index is being opened
+    # @param fn     BAM/CRAM/etc data file filename
+    # @param fnidx  Index filename, or NULL to search alongside @a fn
+    # @param flags  Flags to alter behaviour
+    # @return  The index, or NULL if an error occurred.
+    hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
+
      # Generate and save an index file
      # @param fn        Input BAM/etc filename, to which .csi/etc will be added
      # @param min_shift Positive to generate CSI, or 0 to generate BAI
@@ -1466,6 +1483,7 @@ cdef extern from "htslib/tbx.h" nogil:
  
      tbx_t * tbx_index_load(char *fn)
      tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
+    tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags)
  
      # free the array but not the values
      char **tbx_seqnames(tbx_t *tbx, int *n)
@@ -2088,6 +2106,7 @@ cdef extern from "htslib/vcf.h" nogil:
      #************************************************************************
  
      hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
+    hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
      int bcf_index_build(const char *fn, int min_shift)
      int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
  
diff --git a/pysam/libchtslib.pyi b/pysam/libchtslib.pyi

index 925828bee46e14d01c4648405452547ea6b31e3b..fcd793596fbcfd2f19d962b81f1779abe9ed0d1b 100644 (file)
--- a/pysam/libchtslib.pyi
+++ b/pysam/libchtslib.pyi
@@ -96,7 +96,7 @@ class HTSFile:
      @property
      def is_bcf(self) -> bool: ...
      def reset(self) -> None: ...
-    def seek(self, offset: int) -> int: ...
+    def seek(self, offset: int, whence: int = ...) -> int: ...
      def tell(self) -> int: ...
      def add_hts_options(self, format_options: Optional[List[str]] = ...) -> None: ...
      def parse_region(
diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx

index 760d268563534090f80b5b88b6eccfbb3ef98a06..3cb7b7affa14bf08dc38007f279d3922d667dbfd 100644 (file)
--- a/pysam/libchtslib.pyx
+++ b/pysam/libchtslib.pyx
@@ -14,7 +14,7 @@ from libc.stdint cimport INT32_MAX
  from cpython cimport PyBytes_FromStringAndSize
  from pysam.libchtslib cimport *
  from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io
  
  
  ########################################################################
@@ -35,11 +35,6 @@ from warnings import warn
  
  __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
  
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
  # maximum genomic coordinate
  cdef int MAX_POS = (1 << 31) - 1
  
@@ -108,7 +103,7 @@ cdef class HFile(object):
          self.fp = NULL
  
          if hclose(fp) != 0:
-            raise IOError(herrno(self.fp), 'failed to close HFile', self.name)
+            raise IOError(errno, 'failed to close HFile', self.name)
  
      def fileno(self):
          if self.fp == NULL:
@@ -246,11 +241,11 @@ cdef class HFile(object):
      def readlines(self):
          return list(self)
  
-    def seek(self, Py_ssize_t offset, int whence=SEEK_SET):
+    def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET):
          if self.fp == NULL:
              raise IOError('operation on closed HFile')
  
-        cdef Py_ssize_t off = hseek(self.fp, offset, whence)
+        cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence))
  
          if off < 0:
              raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
@@ -479,19 +474,21 @@ cdef class HTSFile(object):
          """
          return self.seek(self.start_offset)
  
-    def seek(self, uint64_t offset):
+    def seek(self, uint64_t offset, int whence=io.SEEK_SET):
          """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
          if not self.is_open:
              raise ValueError('I/O operation on closed file')
          if self.is_stream:
              raise IOError('seek not available in streams')
  
+        whence = libc_whence_from_io(whence)
+
          cdef int64_t ret
          if self.htsfile.format.compression == bgzf:
              with nogil:
-                ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+                ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
          elif self.htsfile.format.compression == no_compression:
-            ret = 0 if (hseek(self.htsfile.fp.hfile, offset, SEEK_SET) >= 0) else -1
+            ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
          else:
              raise NotImplementedError("seek not implemented in files compressed by method {}".format(
                  self.htsfile.format.compression))
diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd

index de7f115b8e5d7347ab519bbb6f7d8bd021f6a631..1bce05707a89ee4b70e9dd108018039de926a78e 100644 (file)
--- a/pysam/libcutils.pxd
+++ b/pysam/libcutils.pxd
@@ -7,6 +7,8 @@ from cpython cimport array as c_array
  
  cpdef parse_region(contig=*, start=*, stop=*, region=*, reference=*, end=*)
  
+cdef int libc_whence_from_io(int whence)
+
  #########################################################################
  # Utility functions for quality string conversions
  
diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx

index 246c83543c373f55db74413875ec709a354c2fe4..cb08ef271ae0be4f5d2f188e3b237163fbe3f23d 100644 (file)
--- a/pysam/libcutils.pyx
+++ b/pysam/libcutils.pyx
@@ -18,6 +18,7 @@ from libc.stdint cimport INT32_MAX, int32_t
  from libc.stdio cimport fprintf, stderr, fflush
  from libc.stdio cimport stdout as c_stdout
  from posix.fcntl cimport open as c_open, O_WRONLY
+from posix.unistd cimport SEEK_SET, SEEK_CUR, SEEK_END
  
  from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
      samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn
@@ -261,6 +262,16 @@ cpdef parse_region(contig=None,
      return contig, rstart, rstop
  
  
+cdef int libc_whence_from_io(int whence):
+    # io.SEEK_SET/_CUR/_END are by definition 0/1/2 but C/POSIX's equivalents
+    # have unspecified values. So we must translate, but checking for 0/1/2
+    # rather than io.SEEK_SET/etc suffices.
+    if whence == 0: return SEEK_SET
+    if whence == 1: return SEEK_CUR
+    if whence == 2: return SEEK_END
+    return whence  # Otherwise likely invalid, but let HTSlib or OS report it
+
+
  def _pysam_dispatch(collection,
                      method,
                      args=None,
diff --git a/pysam/version.h b/pysam/version.h

index 6d353c59c44df86d4950d8aefc92ae2bb930b32a..645557ba7bb8648365882455b3ca96a153dbaad1 100644 (file)
--- a/pysam/version.h
+++ b/pysam/version.h
@@ -1,5 +1,5 @@
  // Version information used while compiling samtools, bcftools, and htslib
  
-#define SAMTOOLS_VERSION "1.17 (pysam)"
-#define BCFTOOLS_VERSION "1.17 (pysam)"
-#define HTS_VERSION_TEXT "1.17 (pysam)"
+#define SAMTOOLS_VERSION "1.18 (pysam)"
+#define BCFTOOLS_VERSION "1.18 (pysam)"
+#define HTS_VERSION_TEXT "1.18 (pysam)"
diff --git a/pysam/version.py b/pysam/version.py

index 78b3ffdd1c10dbd3072ca0a0def53f4ff5f540ea..62a9f31a513869ea9e4f49850aa3e94c83a5c41b 100644 (file)
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,6 +1,6 @@
  # pysam versioning information
-__version__ = "0.21.0"
+__version__ = "0.22.0"
  
-__samtools_version__ = "1.17"
-__bcftools_version__ = "1.17"
-__htslib_version__ = "1.17"
+__samtools_version__ = "1.18"
+__bcftools_version__ = "1.18"
+__htslib_version__ = "1.18"
diff --git a/requirements-dev.txt b/requirements-dev.txt

new file mode 100644 (file)

index 0000000..420c17e
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1 @@
+Cython>=0.29.12,<4
diff --git a/requirements.txt b/requirements.txt

deleted file mode 100644 (file)

index f937d1c..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-cython>=0.29.12
diff --git a/samtools/README b/samtools/README

index 60b37acc86079ee9c867071313d16365370c9748..8f4f2369ab17e6e94badeaa4d0cf6a96d4dadbed 100644 (file)
--- a/samtools/README
+++ b/samtools/README
@@ -9,7 +9,7 @@ Building samtools
  The typical simple case of building Samtools using the HTSlib bundled within
  this Samtools release tarball is done as follows:
  
-    cd .../samtools-1.17 # Within the unpacked release directory
+    cd .../samtools-1.18 # Within the unpacked release directory
      ./configure
      make
  
@@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing.  Building for
  installation using the HTSlib bundled within this Samtools release tarball,
  and building the various HTSlib utilities such as bgzip is done as follows:
  
-    cd .../samtools-1.17 # Within the unpacked release directory
+    cd .../samtools-1.18 # Within the unpacked release directory
      ./configure --prefix=/path/to/location
      make all all-htslib
      make install install-htslib
@@ -48,7 +48,7 @@ There are two advantages to this:
  To build with plug-ins, you need to use the --enable-plugins configure option
  as follows:
  
-    cd .../samtools-1.17 # Within the unpacked release directory
+    cd .../samtools-1.18 # Within the unpacked release directory
      ./configure --enable-plugins --prefix=/path/to/location
      make all all-htslib
      make install install-htslib
@@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from
  the source distribution instead of installing the package.  In that case
  you can use:
  
-    cd .../samtools-1.17 # Within the unpacked release directory
-    ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.17
+    cd .../samtools-1.18 # Within the unpacked release directory
+    ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.18
      make all all-htslib
  
  It is possible to override the built-in search path using the HTS_PATH
diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c

index 91fc85888bb7e3a3714712312a18bb10ec1c6445..72f39bd5d6888d60d796df9c3b28c4575ff532e4 100644 (file)
--- a/samtools/bam_ampliconclip.c
+++ b/samtools/bam_ampliconclip.c
@@ -1,7 +1,7 @@
  /*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
                            from the 5' end.
  
-    Copyright (C) 2020-2022 Genome Research Ltd.
+    Copyright (C) 2020-2023 Genome Research Ltd.
  
      Authors: Andrew Whitwham <aw7@sanger.ac.uk>
               Rob Davies <rmd+git@sanger.ac.uk>
@@ -59,6 +59,7 @@ typedef struct {
      int oa_tag;
      int del_tag;
      int tol;
+    int unmap_len;
      char *arg_list;
      char *stats_file;
      char *rejects_file;
@@ -638,6 +639,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
      long filtered = 0, written = 0, failed = 0;
      kstring_t str = KS_INITIALIZE;
      kstring_t oat = KS_INITIALIZE;
+    kstring_t seq = KS_INITIALIZE;
      bed_entry_list_t *sites;
      FILE *stats_fp = stderr;
      khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
@@ -829,16 +831,46 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
                  }
              }
  
-            if (param->fail_len >= 0 || param->filter_len >= 0) {
-               hts_pos_t aql = active_query_len(b);
+            if (param->fail_len >= 0 || param->filter_len >= 0 || param->unmap_len >= 0) {
+                hts_pos_t aql = active_query_len(b);
  
-               if (param->fail_len >= 0 && aql <= param->fail_len) {
-                   b->core.flag |= BAM_FQCFAIL;
-               }
+                if (param->fail_len >= 0 && aql <= param->fail_len) {
+                    b->core.flag |= BAM_FQCFAIL;
+                }
+
+                if (param->filter_len >= 0 && aql <= param->filter_len) {
+                    filter = 1;
+                }
+
+                if (param->unmap_len >= 0 && aql <= param->unmap_len) {
+
+                    if (ks_resize(&seq, b->core.l_qseq) < 0) {
+                        fprintf(stderr, "[ampliconclip] error: allocate memory for sequence %s\n", bam_get_seq(b));
+                        goto fail;
+                    }
+
+                    ks_clear(&seq);
+                    char *sb = ks_str(&seq);
+                    uint8_t *sequence = bam_get_seq(b);
+                    int i;
  
-               if (param->filter_len >= 0 && aql <= param->filter_len) {
-                   filter = 1;
-               }
+                    for (i = 0; i < b->core.l_qseq ; ++i) {
+                        *sb++ = seq_nt16_str[bam_seqi(sequence, i)];
+                    }
+
+                    if (bam_set1(b_tmp, b->core.l_qname - b->core.l_extranul - 1, bam_get_qname(b),
+                                 (b->core.flag | BAM_FUNMAP), b->core.tid, b->core.pos, 0,
+                                 0, NULL, b->core.mtid, b->core.mpos, b->core.isize,
+                                 b->core.l_qseq, seq.s, (const char *)bam_get_qual(b),
+                                 bam_get_l_aux(b)) < 0) {
+                        fprintf(stderr, "[ampliconclip] error: could not unmap read %s\n", bam_get_seq(b));
+                        goto fail;
+                    }
+
+                    memcpy(bam_get_aux(b_tmp), bam_get_aux(b), bam_get_l_aux(b));
+                    b_tmp->l_data += bam_get_l_aux(b);
+                    swap_bams(&b, &b_tmp);
+                }
             }
  
             if (b->core.flag & BAM_FQCFAIL) {
@@ -913,6 +945,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
  fail:
      destroy_bed_hash(bed_hash);
      ks_free(&oat);
+    ks_free(&seq);
      sam_hdr_destroy(header);
      bam_destroy1(b);
      bam_destroy1(b_tmp);
@@ -935,6 +968,7 @@ static void usage(void) {
      fprintf(stderr, " --fail              mark unclipped, mapped reads as QCFAIL.\n");
      fprintf(stderr, " --filter-len INT    do not output reads INT size or shorter.\n");
      fprintf(stderr, " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n");
+    fprintf(stderr, " --unmap-len  INT    unmap reads INT size or shorter, default 0.\n");
      fprintf(stderr, " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n");
      fprintf(stderr, " --rejects-file FILE file to write filtered reads.\n");
      fprintf(stderr, " --original          for clipped entries add an OA tag with original data.\n");
@@ -955,7 +989,7 @@ int amplicon_clip_main(int argc, char **argv) {
      htsThreadPool p = {NULL, 0};
      samFile *in = NULL, *out = NULL, *reject = NULL;
      clipping_type clipping = soft_clip;
-    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, 0, NULL, NULL, NULL};
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
@@ -973,6 +1007,7 @@ int amplicon_clip_main(int argc, char **argv) {
          {"original", no_argument, NULL, 1013},
          {"keep-tag", no_argument, NULL, 1014},
          {"tolerance", required_argument, NULL, 1015},
+        {"unmap-len", required_argument, NULL, 1016},
          {NULL, 0, NULL, 0}
      };
  
@@ -996,6 +1031,7 @@ int amplicon_clip_main(int argc, char **argv) {
              case 1013: param.oa_tag = 1; break;
              case 1014: param.del_tag = 0; break;
              case 1015: param.tol = atoi(optarg); break;
+            case 1016: param.unmap_len = atoi(optarg); break;
              default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                        /* else fall-through */
              case '?': usage(); exit(1);
@@ -1014,7 +1050,7 @@ int amplicon_clip_main(int argc, char **argv) {
  
      if (param.tol < 0) {
          fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d,"
-                        " reseting tolerance to default of 5.\n", param.tol);
+                        " resetting tolerance to default of 5.\n", param.tol);
          param.tol = 5;
      }
  
diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c

index 4eb9c5a4dd7169e306b3a7d56a237e40d1770a66..0c368508e9c7d5d53d31defef7f196cba5a0bf62 100644 (file)
--- a/samtools/bam_ampliconclip.c.pysam.c
+++ b/samtools/bam_ampliconclip.c.pysam.c
@@ -3,7 +3,7 @@
  /*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
                            from the 5' end.
  
-    Copyright (C) 2020-2022 Genome Research Ltd.
+    Copyright (C) 2020-2023 Genome Research Ltd.
  
      Authors: Andrew Whitwham <aw7@sanger.ac.uk>
               Rob Davies <rmd+git@sanger.ac.uk>
@@ -61,6 +61,7 @@ typedef struct {
      int oa_tag;
      int del_tag;
      int tol;
+    int unmap_len;
      char *arg_list;
      char *stats_file;
      char *rejects_file;
@@ -640,6 +641,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
      long filtered = 0, written = 0, failed = 0;
      kstring_t str = KS_INITIALIZE;
      kstring_t oat = KS_INITIALIZE;
+    kstring_t seq = KS_INITIALIZE;
      bed_entry_list_t *sites;
      FILE *stats_fp = samtools_stderr;
      khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
@@ -831,16 +833,46 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
                  }
              }
  
-            if (param->fail_len >= 0 || param->filter_len >= 0) {
-               hts_pos_t aql = active_query_len(b);
+            if (param->fail_len >= 0 || param->filter_len >= 0 || param->unmap_len >= 0) {
+                hts_pos_t aql = active_query_len(b);
  
-               if (param->fail_len >= 0 && aql <= param->fail_len) {
-                   b->core.flag |= BAM_FQCFAIL;
-               }
+                if (param->fail_len >= 0 && aql <= param->fail_len) {
+                    b->core.flag |= BAM_FQCFAIL;
+                }
+
+                if (param->filter_len >= 0 && aql <= param->filter_len) {
+                    filter = 1;
+                }
+
+                if (param->unmap_len >= 0 && aql <= param->unmap_len) {
+
+                    if (ks_resize(&seq, b->core.l_qseq) < 0) { // one char per query base
+                        fprintf(samtools_stderr, "[ampliconclip] error: allocate memory for sequence %s\n", bam_get_qname(b)); // name the read; bam_get_seq() is packed 4-bit data, not a C string
+                        goto fail;
+                    }
+
+                    ks_clear(&seq);
+                    char *sb = ks_str(&seq);
+                    uint8_t *sequence = bam_get_seq(b);
+                    int i;
  
-               if (param->filter_len >= 0 && aql <= param->filter_len) {
-                   filter = 1;
-               }
+                    for (i = 0; i < b->core.l_qseq ; ++i) {
+                        *sb++ = seq_nt16_str[bam_seqi(sequence, i)];
+                    }
+
+                    if (bam_set1(b_tmp, b->core.l_qname - b->core.l_extranul - 1, bam_get_qname(b),
+                                 (b->core.flag | BAM_FUNMAP), b->core.tid, b->core.pos, 0, // mapq reset to 0
+                                 0, NULL, b->core.mtid, b->core.mpos, b->core.isize, // no CIGAR on the unmapped copy
+                                 b->core.l_qseq, seq.s, (const char *)bam_get_qual(b),
+                                 bam_get_l_aux(b)) < 0) { // last arg sizes the aux area copied in just below
+                        fprintf(samtools_stderr, "[ampliconclip] error: could not unmap read %s\n", bam_get_qname(b)); // name the read; bam_get_seq() is packed 4-bit data, not a C string
+                        goto fail;
+                    }
+
+                    memcpy(bam_get_aux(b_tmp), bam_get_aux(b), bam_get_l_aux(b));
+                    b_tmp->l_data += bam_get_l_aux(b);
+                    swap_bams(&b, &b_tmp);
+                }
             }
  
             if (b->core.flag & BAM_FQCFAIL) {
@@ -915,6 +947,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
  fail:
      destroy_bed_hash(bed_hash);
      ks_free(&oat);
+    ks_free(&seq);
      sam_hdr_destroy(header);
      bam_destroy1(b);
      bam_destroy1(b_tmp);
@@ -937,6 +970,7 @@ static void usage(void) {
      fprintf(samtools_stderr, " --fail              mark unclipped, mapped reads as QCFAIL.\n");
      fprintf(samtools_stderr, " --filter-len INT    do not output reads INT size or shorter.\n");
      fprintf(samtools_stderr, " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n");
+    fprintf(samtools_stderr, " --unmap-len  INT    unmap reads INT size or shorter, default 0.\n");
      fprintf(samtools_stderr, " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n");
      fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n");
      fprintf(samtools_stderr, " --original          for clipped entries add an OA tag with original data.\n");
@@ -957,7 +991,7 @@ int amplicon_clip_main(int argc, char **argv) {
      htsThreadPool p = {NULL, 0};
      samFile *in = NULL, *out = NULL, *reject = NULL;
      clipping_type clipping = soft_clip;
-    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, 0, NULL, NULL, NULL};
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
@@ -975,6 +1009,7 @@ int amplicon_clip_main(int argc, char **argv) {
          {"original", no_argument, NULL, 1013},
          {"keep-tag", no_argument, NULL, 1014},
          {"tolerance", required_argument, NULL, 1015},
+        {"unmap-len", required_argument, NULL, 1016},
          {NULL, 0, NULL, 0}
      };
  
@@ -998,6 +1033,7 @@ int amplicon_clip_main(int argc, char **argv) {
              case 1013: param.oa_tag = 1; break;
              case 1014: param.del_tag = 0; break;
              case 1015: param.tol = atoi(optarg); break;
+            case 1016: param.unmap_len = atoi(optarg); break;
              default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                        /* else fall-through */
              case '?': usage(); samtools_exit(1);
@@ -1016,7 +1052,7 @@ int amplicon_clip_main(int argc, char **argv) {
  
      if (param.tol < 0) {
          fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d,"
-                        " reseting tolerance to default of 5.\n", param.tol);
+                        " resetting tolerance to default of 5.\n", param.tol);
          param.tol = 5;
      }
  
diff --git a/samtools/bam_consensus.c b/samtools/bam_consensus.c

index 4cdaf3fac42e2ae450ca10091ce474b4fcf1e9cc..3cbb24fa7d96afffc791803215a4b752d5af9ee9 100644 (file)
--- a/samtools/bam_consensus.c
+++ b/samtools/bam_consensus.c
@@ -2043,20 +2043,30 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
      }
  
      if (opts->all_bases) {
-        if (tid != opts->last_tid && opts->last_tid >= 0) {
-            hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
-            if (opts->iter)
-                len =  MIN(opts->iter->end, len);
-            if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos,
-                              len) < 0)
-                return -1;
-            if (tid >= 0) {
-                if (empty_pileup2(opts, opts->h, tid,
-                                  opts->iter ? opts->iter->beg : 0,
-                                  pos-1) < 0)
+        if (tid != opts->last_tid && opts->last_tid >= -1) {
+            if (opts->last_tid >= 0) {
+                // remainder of previous ref
+                hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+                if (opts->iter)
+                    len =  MIN(opts->iter->end, len);
+                if (empty_pileup2(opts, opts->h, opts->last_tid,
+                                  opts->last_pos, len) < 0)
+                    return -1;
+            }
+
+            opts->last_pos = opts->iter ? opts->iter->beg : 0;
+        }
+
+        // Any refs between last_tid and tid
+        if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) {
+            while (++opts->last_tid < tid) {
+                hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+                if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0)
                      return -1;
              }
          }
+
+        // Any gaps in this ref (same tid) or at start of this new tid
          if (opts->last_pos >= 0 && pos > opts->last_pos+1) {
              if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos,
                                pos-1) < 0)
@@ -2167,9 +2177,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
              return 0;
      }
  
+ next_ref:
      if (tid != opts->last_tid) {
          if (opts->last_tid != -1) {
              if (opts->all_bases) {
+                // Fill in remainder of previous reference
                  int i, N;
                  if (opts->iter) {
                      opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1);
@@ -2197,9 +2209,13 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
          }
  
          seq->l = 0; qual->l = 0;
+
+        if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) {
+            opts->last_pos = 0;
+            goto next_ref;
+        }
+
          opts->last_tid = tid;
-//        if (opts->all_bases)
-//            opts->last_pos = 0;
          if (opts->iter)
              opts->last_pos = opts->iter->beg;
          else
@@ -2710,6 +2726,13 @@ int main_consensus(int argc, char **argv) {
              if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0)
                  goto err;
          }
+        while (!opts.iter && opts.all_bases > 1 && // whole-file mode with all_bases > 1 only
+               ++opts.last_tid < opts.h->n_targets) { // walk the references after last_tid
+            hts_pos_t len = sam_hdr_tid2len(opts.h, opts.last_tid); // hts_pos_t avoids truncating long references
+            if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0)
+                goto err;
+        }
+
      } else {
          if (pileup_loop(opts.fp, opts.h, readaln2,
                          opts.mode != MODE_SIMPLE ? nm_init : NULL,
@@ -2717,6 +2740,8 @@ int main_consensus(int argc, char **argv) {
                          opts.mode != MODE_SIMPLE ? nm_free : NULL,
                          &opts) < 0)
              goto err;
+
+    next_ref_q:
          if (opts.all_bases) {
              // fill out terminator
              int tid = opts.iter ? opts.iter->tid : opts.last_tid;
@@ -2744,6 +2769,13 @@ int main_consensus(int argc, char **argv) {
              dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid),
                         opts.ks_ins_seq.s,  opts.ks_ins_seq.l,
                         opts.ks_ins_qual.s, opts.ks_ins_qual.l);
+
+        if (!opts.iter && opts.all_bases > 1 &&
+            ++opts.last_tid < opts.h->n_targets) {
+            opts.last_pos = 0;
+            opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0;
+            goto next_ref_q;
+        }
  //        if (consensus_loop(&opts) < 0) {
  //            print_error_errno("consensus", "Failed");
  //            goto err;
diff --git a/samtools/bam_consensus.c.pysam.c b/samtools/bam_consensus.c.pysam.c

index 70f47ba394cd4f2528a9459c55da973b4c42a217..b090a9a565a95768460b2f923d184128909972c8 100644 (file)
--- a/samtools/bam_consensus.c.pysam.c
+++ b/samtools/bam_consensus.c.pysam.c
@@ -2045,20 +2045,30 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
      }
  
      if (opts->all_bases) {
-        if (tid != opts->last_tid && opts->last_tid >= 0) {
-            hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
-            if (opts->iter)
-                len =  MIN(opts->iter->end, len);
-            if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos,
-                              len) < 0)
-                return -1;
-            if (tid >= 0) {
-                if (empty_pileup2(opts, opts->h, tid,
-                                  opts->iter ? opts->iter->beg : 0,
-                                  pos-1) < 0)
+        if (tid != opts->last_tid && opts->last_tid >= -1) {
+            if (opts->last_tid >= 0) {
+                // remainder of previous ref
+                hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+                if (opts->iter)
+                    len =  MIN(opts->iter->end, len);
+                if (empty_pileup2(opts, opts->h, opts->last_tid,
+                                  opts->last_pos, len) < 0)
+                    return -1;
+            }
+
+            opts->last_pos = opts->iter ? opts->iter->beg : 0;
+        }
+
+        // Any refs between last_tid and tid
+        if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) {
+            while (++opts->last_tid < tid) {
+                hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+                if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0)
                      return -1;
              }
          }
+
+        // Any gaps in this ref (same tid) or at start of this new tid
          if (opts->last_pos >= 0 && pos > opts->last_pos+1) {
              if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos,
                                pos-1) < 0)
@@ -2169,9 +2179,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
              return 0;
      }
  
+ next_ref:
      if (tid != opts->last_tid) {
          if (opts->last_tid != -1) {
              if (opts->all_bases) {
+                // Fill in remainder of previous reference
                  int i, N;
                  if (opts->iter) {
                      opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1);
@@ -2199,9 +2211,13 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p,
          }
  
          seq->l = 0; qual->l = 0;
+
+        if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) {
+            opts->last_pos = 0;
+            goto next_ref;
+        }
+
          opts->last_tid = tid;
-//        if (opts->all_bases)
-//            opts->last_pos = 0;
          if (opts->iter)
              opts->last_pos = opts->iter->beg;
          else
@@ -2712,6 +2728,13 @@ int main_consensus(int argc, char **argv) {
              if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0)
                  goto err;
          }
+        while (!opts.iter && opts.all_bases > 1 && // whole-file mode with all_bases > 1 only
+               ++opts.last_tid < opts.h->n_targets) { // walk the references after last_tid
+            hts_pos_t len = sam_hdr_tid2len(opts.h, opts.last_tid); // hts_pos_t avoids truncating long references
+            if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0)
+                goto err;
+        }
+
      } else {
          if (pileup_loop(opts.fp, opts.h, readaln2,
                          opts.mode != MODE_SIMPLE ? nm_init : NULL,
@@ -2719,6 +2742,8 @@ int main_consensus(int argc, char **argv) {
                          opts.mode != MODE_SIMPLE ? nm_free : NULL,
                          &opts) < 0)
              goto err;
+
+    next_ref_q:
          if (opts.all_bases) {
              // fill out terminator
              int tid = opts.iter ? opts.iter->tid : opts.last_tid;
@@ -2746,6 +2771,13 @@ int main_consensus(int argc, char **argv) {
              dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid),
                         opts.ks_ins_seq.s,  opts.ks_ins_seq.l,
                         opts.ks_ins_qual.s, opts.ks_ins_qual.l);
+
+        if (!opts.iter && opts.all_bases > 1 &&
+            ++opts.last_tid < opts.h->n_targets) {
+            opts.last_pos = 0;
+            opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0;
+            goto next_ref_q;
+        }
  //        if (consensus_loop(&opts) < 0) {
  //            print_error_errno("consensus", "Failed");
  //            goto err;
diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c

index c17821d9db027201397590b380d96004731371d0..e4701b1e78710937261f34407b2d824aa61a30b3 100644 (file)
--- a/samtools/bam_fastq.c
+++ b/samtools/bam_fastq.c
@@ -1,6 +1,6 @@
  /*  bam_fastq.c -- FASTA and FASTQ file generation
  
-    Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
+    Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd.
      Portions copyright (C) 2009, 2011, 2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <assert.h>
  #include <inttypes.h>
  #include <unistd.h>
+#include <float.h>
  
  #include "htslib/sam.h"
  #include "htslib/klist.h"
@@ -64,8 +65,14 @@ static void bam2fq_usage(FILE *to, const char *command)
  "  -o FILE      write reads designated READ1 or READ2 to FILE\n"
  "               note: if a singleton file is specified with -s, only\n"
  "               paired reads will be written to the -1 and -2 files.\n"
-"  -f INT       only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
-"  -F INT       only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
+"  -d, --tag TAG[:VAL]\n"
+"               only include reads containing TAG, optionally with value VAL\n"
+"  -f, --require-flags INT\n"
+"               only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
+"  -F, --excl[ude]-flags INT\n"
+"               only include reads with none of the FLAGs in INT present [0x900]\n"   //   F&x == 0
+"      --rf, --incl[ude]-flags INT\n"
+"               only include reads with any  of the FLAGs in INT present [0]\n"       // !(F&x == 0)
  "  -G INT       only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
  "  -n           don't append /1 and /2 to the read name\n"
  "  -N           always append /1 and /2 to the read name\n",
@@ -132,7 +139,7 @@ typedef struct bam2fq_opts {
      char *fnr[3];
      char *fn_input; // pointer to input filename in argv do not free
      bool has12, has12always, use_oq, copy_tags, illumina_tag;
-    int flag_on, flag_off, flag_alloff;
+    int flag_on, flag_off, flag_alloff, flag_anyon;
      sam_global_args ga;
      fastfile filetype;
      int def_qual;
@@ -142,6 +149,10 @@ typedef struct bam2fq_opts {
      char *index_format;
      char *extra_tags;
      char compression_level;
+    const char *filter_tag;       // -d opt
+    const char *filter_value_str;
+    int64_t filter_value_int;
+    float filter_value_flt;
  } bam2fq_opts_t;
  
  typedef struct bam2fq_state {
@@ -152,7 +163,7 @@ typedef struct bam2fq_state {
      samFile *hstdout;
      sam_hdr_t *h;
      bool has12, use_oq, copy_tags, illumina_tag;
-    int flag_on, flag_off, flag_alloff;
+    int flag_on, flag_off, flag_alloff, flag_anyon;
      fastfile filetype;
      int def_qual;
      char *index_sequence;
@@ -176,6 +187,9 @@ static void free_opts(bam2fq_opts_t *opts)
      free(opts);
  }
  
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c)  ((c) + 128)
+
  // return true if valid
  static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
  {
@@ -193,12 +207,19 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      opts->extra_tags = NULL;
      opts->compression_level = 1;
      opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY;
-    int flag_off_set = 0;
  
      int c;
      sam_global_args_init(&opts->ga);
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+        {"require-flags", required_argument, NULL, 'f'},
+        {"excl-flags", required_argument, NULL, 'F'},
+        {"exclude-flags", required_argument, NULL, 'F'},
+        // following the same convention as view: g exists as a longoption_only
+        // argument, accessible from the command line as --rf/--incl[ude]-flags
+        {"rf", required_argument, NULL, LONGOPT('g')},
+        {"incl-flags", required_argument, NULL, LONGOPT('g')},
+        {"include-flags", required_argument, NULL, LONGOPT('g')},
          {"i1", required_argument, NULL, 1},
          {"I1", required_argument, NULL, 1},
          {"i2", required_argument, NULL, 2},
@@ -208,9 +229,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
          {"index-format", required_argument, NULL, 3},
          {"barcode-tag", required_argument, NULL, 'b'},
          {"quality-tag", required_argument, NULL, 'q'},
+        {"tag", required_argument, NULL, 'd'},
          { NULL, 0, NULL, 0 }
      };
-    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:",
                              lopts, NULL)) > 0) {
          switch (c) {
              case 'b': opts->barcode_tag = optarg; break;
@@ -223,14 +245,11 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
              case '2': opts->fnr[2] = optarg; break;
              case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break;
              case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
-            case 'F':
-                if (!flag_off_set) {
-                    flag_off_set = 1;
-                    opts->flag_off = 0;
-                }
-                opts->flag_off |= strtol(optarg, 0, 0);
-                break;
+            // note that flag_off does not have |= because it has a default
+            // value of 0x900 which needs to be replaced by the optarg
+            case 'F': opts->flag_off = strtol(optarg, 0, 0); break;
              case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
+            case LONGOPT('g'): opts->flag_anyon |= strtol(optarg, 0, 0); break;
              case 'n': opts->has12 = false; break;
              case 'N': opts->has12always = true; break;
              case 'O': opts->use_oq = true; break;
@@ -247,6 +266,22 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
              case 'T': opts->extra_tags = optarg; break;
              case 'v': opts->def_qual = atoi(optarg); break;
  
+            case 'd':
+                if (strlen(optarg) < 2 ||
+                    (strlen(optarg) > 2 && optarg[2] != ':')) {
+                    print_error("fastq",
+                                "Invalid \"tag:value\" option: \"%s\"",
+                                optarg);
+                    free_opts(opts);
+                    return false;
+                }
+
+                opts->filter_tag = optarg;
+                opts->filter_value_str = strlen(optarg) > 2 ? optarg+3 : NULL;
+                opts->filter_value_int = INT64_MAX; // fill out later
+                opts->filter_value_flt = FLT_MAX;
+                break;
+
              case '?':
                  bam2fq_usage(stderr, argv[0]);
                  free_opts(opts);
@@ -401,6 +436,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      state->flag_on = opts->flag_on;
      state->flag_off = opts->flag_off;
      state->flag_alloff = opts->flag_alloff;
+    state->flag_anyon = opts->flag_anyon;
      state->has12 = opts->has12;
      state->use_oq = opts->use_oq;
      state->illumina_tag = opts->illumina_tag;
@@ -411,7 +447,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      state->hstdout = NULL;
      state->compression_level = opts->compression_level;
  
-    state->fp = sam_open(opts->fn_input, "r");
+    state->fp = sam_open_format(opts->fn_input, "r", &opts->ga.in);
      if (state->fp == NULL) {
          print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
          free(state);
@@ -430,7 +466,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      }
  
      uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
-    if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
+    if (opts->use_oq || opts->extra_tags || opts->index_file[0])
+        rf |= SAM_AUX;
+    if (opts->filter_tag) {
+        if (memcmp(opts->filter_tag, "NM", 2) == 0 ||
+            memcmp(opts->filter_tag, "MD", 2) == 0)
+            rf |= SAM_AUX | SAM_SEQ;
+        else if (memcmp(opts->filter_tag, "RG", 2) == 0)
+            rf |= SAM_RGAUX;
+        else
+            rf |= SAM_AUX;
+    }
      if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
          fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
          free(state);
@@ -576,10 +622,59 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
      return valid;
  }
  
-static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state,
+                                 bam2fq_opts_t *opts)
  {
+    if (opts->filter_tag) {
+        uint8_t *s = bam_aux_get(b, opts->filter_tag);
+        if (!s)
+            return true;
+
+        if (opts->filter_value_str) {
+            switch (*s) {
+            case 'i': case 'I':
+            case 's': case 'S':
+            case 'c': case 'C':
+                if (opts->filter_value_int == INT64_MAX)
+                    // cache integer conversion for repeated use
+                    opts->filter_value_int =
+                        strtoll(opts->filter_value_str, NULL, 0);
+                if (opts->filter_value_int != bam_aux2i(s))
+                    return true;
+                break;
+
+            case 'f':
+                if (opts->filter_value_flt == FLT_MAX)
+                    opts->filter_value_flt = atof(opts->filter_value_str);
+                // Comparing floats is hard.
+                // Eg (double)0.1 - (double)0.1f is -1.5e-9.
+                // Given BAM binary encoding is float however, just keep it.
+                // This means rounding errors will (hopefully) always be the
+                // same and basic equality still works.
+                if (opts->filter_value_flt != (float)bam_aux2f(s))
+                    return true;
+                break;
+
+            case 'A':
+                if (s[1] != *opts->filter_value_str)
+                    return true;
+                break;
+
+            case 'Z': case 'H':
+                if (strcmp((char *)s+1, opts->filter_value_str) != 0)
+                    return true;
+                break;
+
+            default:
+                // Anything unsupported fails the filter match too.
+                return true;
+            }
+        }
+    }
+
      return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
          ||  (b->core.flag&(state->flag_off)) != 0
+        ||  (((b->core.flag&(state->flag_anyon)) == 0) && (state->flag_anyon != 0))
          ||  (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
  
  }
@@ -798,7 +893,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
          }
          at_eof = res < 0;
  
-        if (!at_eof && filter_it_out(b[n], state))
+        if (!at_eof && filter_it_out(b[n], state, opts))
              continue;
          if (!at_eof) {
              ++n_reads;
diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c

index fbe65fbec6c73cf48a4dc04e9a5165ca53eee5a0..cd8fa2757554205d4a0e2fbf1e607d9c208eaf8f 100644 (file)
--- a/samtools/bam_fastq.c.pysam.c
+++ b/samtools/bam_fastq.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_fastq.c -- FASTA and FASTQ file generation
  
-    Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
+    Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd.
      Portions copyright (C) 2009, 2011, 2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <assert.h>
  #include <inttypes.h>
  #include <unistd.h>
+#include <float.h>
  
  #include "htslib/sam.h"
  #include "htslib/klist.h"
@@ -66,8 +67,14 @@ static void bam2fq_usage(FILE *to, const char *command)
  "  -o FILE      write reads designated READ1 or READ2 to FILE\n"
  "               note: if a singleton file is specified with -s, only\n"
  "               paired reads will be written to the -1 and -2 files.\n"
-"  -f INT       only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
-"  -F INT       only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
+"  -d, --tag TAG[:VAL]\n"
+"               only include reads containing TAG, optionally with value VAL\n"
+"  -f, --require-flags INT\n"
+"               only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
+"  -F, --excl[ude]-flags INT\n"
+"               only include reads with none of the FLAGs in INT present [0x900]\n"   //   F&x == 0
+"      --rf, --incl[ude]-flags INT\n"
+"               only include reads with any  of the FLAGs in INT present [0]\n"       // !(F&x == 0)
  "  -G INT       only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
  "  -n           don't append /1 and /2 to the read name\n"
  "  -N           always append /1 and /2 to the read name\n",
@@ -134,7 +141,7 @@ typedef struct bam2fq_opts {
      char *fnr[3];
      char *fn_input; // pointer to input filename in argv do not free
      bool has12, has12always, use_oq, copy_tags, illumina_tag;
-    int flag_on, flag_off, flag_alloff;
+    int flag_on, flag_off, flag_alloff, flag_anyon;
      sam_global_args ga;
      fastfile filetype;
      int def_qual;
@@ -144,6 +151,10 @@ typedef struct bam2fq_opts {
      char *index_format;
      char *extra_tags;
      char compression_level;
+    const char *filter_tag;       // -d opt
+    const char *filter_value_str;
+    int64_t filter_value_int;
+    float filter_value_flt;
  } bam2fq_opts_t;
  
  typedef struct bam2fq_state {
@@ -154,7 +165,7 @@ typedef struct bam2fq_state {
      samFile *hstdout;
      sam_hdr_t *h;
      bool has12, use_oq, copy_tags, illumina_tag;
-    int flag_on, flag_off, flag_alloff;
+    int flag_on, flag_off, flag_alloff, flag_anyon;
      fastfile filetype;
      int def_qual;
      char *index_sequence;
@@ -178,6 +189,9 @@ static void free_opts(bam2fq_opts_t *opts)
      free(opts);
  }
  
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c)  ((c) + 128)
+
  // return true if valid
  static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
  {
@@ -195,12 +209,19 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      opts->extra_tags = NULL;
      opts->compression_level = 1;
      opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY;
-    int flag_off_set = 0;
  
      int c;
      sam_global_args_init(&opts->ga);
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+        {"require-flags", required_argument, NULL, 'f'},
+        {"excl-flags", required_argument, NULL, 'F'},
+        {"exclude-flags", required_argument, NULL, 'F'},
+        // following the same convention as view: g exists as a longoption_only
+        // argument, accessible from the command line as --rf/--incl[ude]-flags
+        {"rf", required_argument, NULL, LONGOPT('g')},
+        {"incl-flags", required_argument, NULL, LONGOPT('g')},
+        {"include-flags", required_argument, NULL, LONGOPT('g')},
          {"i1", required_argument, NULL, 1},
          {"I1", required_argument, NULL, 1},
          {"i2", required_argument, NULL, 2},
@@ -210,9 +231,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
          {"index-format", required_argument, NULL, 3},
          {"barcode-tag", required_argument, NULL, 'b'},
          {"quality-tag", required_argument, NULL, 'q'},
+        {"tag", required_argument, NULL, 'd'},
          { NULL, 0, NULL, 0 }
      };
-    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:",
                              lopts, NULL)) > 0) {
          switch (c) {
              case 'b': opts->barcode_tag = optarg; break;
@@ -225,14 +247,11 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
              case '2': opts->fnr[2] = optarg; break;
              case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break;
              case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
-            case 'F':
-                if (!flag_off_set) {
-                    flag_off_set = 1;
-                    opts->flag_off = 0;
-                }
-                opts->flag_off |= strtol(optarg, 0, 0);
-                break;
+            // note that flag_off does not have |= because it has a default
+            // value of 0x900 which needs to be replaced by the optarg
+            case 'F': opts->flag_off = strtol(optarg, 0, 0); break;
              case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
+            case LONGOPT('g'): opts->flag_anyon |= strtol(optarg, 0, 0); break;
              case 'n': opts->has12 = false; break;
              case 'N': opts->has12always = true; break;
              case 'O': opts->use_oq = true; break;
@@ -249,6 +268,22 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
              case 'T': opts->extra_tags = optarg; break;
              case 'v': opts->def_qual = atoi(optarg); break;
  
+            case 'd':
+                if (strlen(optarg) < 2 ||
+                    (strlen(optarg) > 2 && optarg[2] != ':')) {
+                    print_error("fastq",
+                                "Invalid \"tag:value\" option: \"%s\"",
+                                optarg);
+                    free_opts(opts);
+                    return false;
+                }
+
+                opts->filter_tag = optarg;
+                opts->filter_value_str = strlen(optarg) > 2 ? optarg+3 : NULL;
+                opts->filter_value_int = INT64_MAX; // fill out later
+                opts->filter_value_flt = FLT_MAX;
+                break;
+
              case '?':
                  bam2fq_usage(samtools_stderr, argv[0]);
                  free_opts(opts);
@@ -403,6 +438,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      state->flag_on = opts->flag_on;
      state->flag_off = opts->flag_off;
      state->flag_alloff = opts->flag_alloff;
+    state->flag_anyon = opts->flag_anyon;
      state->has12 = opts->has12;
      state->use_oq = opts->use_oq;
      state->illumina_tag = opts->illumina_tag;
@@ -413,7 +449,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      state->hstdout = NULL;
      state->compression_level = opts->compression_level;
  
-    state->fp = sam_open(opts->fn_input, "r");
+    state->fp = sam_open_format(opts->fn_input, "r", &opts->ga.in);
      if (state->fp == NULL) {
          print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
          free(state);
@@ -432,7 +468,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      }
  
      uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
-    if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
+    if (opts->use_oq || opts->extra_tags || opts->index_file[0])
+        rf |= SAM_AUX;
+    if (opts->filter_tag) {
+        if (memcmp(opts->filter_tag, "NM", 2) == 0 ||
+            memcmp(opts->filter_tag, "MD", 2) == 0)
+            rf |= SAM_AUX | SAM_SEQ;
+        else if (memcmp(opts->filter_tag, "RG", 2) == 0)
+            rf |= SAM_RGAUX;
+        else
+            rf |= SAM_AUX;
+    }
      if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
          fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
          free(state);
@@ -578,10 +624,59 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
      return valid;
  }
  
-static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state,
+                                 bam2fq_opts_t *opts)
  {
+    if (opts->filter_tag) {
+        uint8_t *s = bam_aux_get(b, opts->filter_tag);
+        if (!s)
+            return true;
+
+        if (opts->filter_value_str) {
+            switch (*s) {
+            case 'i': case 'I':
+            case 's': case 'S':
+            case 'c': case 'C':
+                if (opts->filter_value_int == INT64_MAX)
+                    // cache integer conversion for repeated use
+                    opts->filter_value_int =
+                        strtoll(opts->filter_value_str, NULL, 0);
+                if (opts->filter_value_int != bam_aux2i(s))
+                    return true;
+                break;
+
+            case 'f':
+                if (opts->filter_value_flt == FLT_MAX)
+                    opts->filter_value_flt = atof(opts->filter_value_str);
+                // Comparing floats is hard.
+                // Eg (double)0.1 - (double)0.1f is -1.5e-9.
+                // Given BAM binary encoding is float however, just keep it.
+                // This means rounding errors will (hopefully) always be the
+                // same and basic equality still works.
+                if (opts->filter_value_flt != (float)bam_aux2f(s))
+                    return true;
+                break;
+
+            case 'A':
+                if (s[1] != *opts->filter_value_str)
+                    return true;
+                break;
+
+            case 'Z': case 'H':
+                if (strcmp((char *)s+1, opts->filter_value_str) != 0)
+                    return true;
+                break;
+
+            default:
+                // Anything unsupported fails the filter match too.
+                return true;
+            }
+        }
+    }
+
      return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
          ||  (b->core.flag&(state->flag_off)) != 0
+        ||  (((b->core.flag&(state->flag_anyon)) == 0) && (state->flag_anyon != 0))
          ||  (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
  
  }
@@ -800,7 +895,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
          }
          at_eof = res < 0;
  
-        if (!at_eof && filter_it_out(b[n], state))
+        if (!at_eof && filter_it_out(b[n], state, opts))
              continue;
          if (!at_eof) {
              ++n_reads;
diff --git a/samtools/bam_import.c b/samtools/bam_import.c

index 14ff0b0de7f81d10f79de0c987c5968aef2c3961..079e04bf6a841682aec35a6790b34a636179fefb 100644 (file)
--- a/samtools/bam_import.c
+++ b/samtools/bam_import.c
@@ -4,7 +4,7 @@
   *   samtools import a_1.fq a_2.fq
   *   samtools import a_interleaved.fq
   *
- * Copyright (C) 2020-2021 Genome Research Ltd.
+ * Copyright (C) 2020-2021, 2023 Genome Research Ltd.
   *
   * Author: James Bonfield <jkb@sanger.ac.uk>
   */
@@ -93,6 +93,7 @@ typedef struct {
      char *rg;
      char *rg_line;
      char *order;
+    int order_str;
      int compress_level;
      htsThreadPool p;
      int name2;
@@ -358,9 +359,23 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
              }
  
              if (opts->order) {
-                if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
-                    ret = -1;
-                    goto err;
+                if (opts->order_str) {
+                    char buf[25];
+                    snprintf(buf, sizeof(buf), "%0*"PRIu64,
+                             opts->order_str, read_num++);
+                    if (bam_aux_update_str(b, opts->order,
+                                           strlen(buf), buf) < 0) {
+                        ret = -1;
+                        goto err;
+                    }
+                } else {
+                    if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+                        ret = -1;
+                        goto err;
+                    }
+                    if (read_num == UINT_MAX)
+                        fprintf(stderr, "Warning: --order tag has overflowed."
+                                "  Consider using TAG:LENGTH instead\n");
                  }
              }
  
@@ -421,6 +436,7 @@ int main_import(int argc, char *argv[]) {
          .rg = NULL,
          .rg_line = NULL,
          .order = NULL,
+        .order_str = 0,
          .compress_level = -1,
          .name2 = 0,
      };
@@ -470,7 +486,11 @@ int main_import(int argc, char *argv[]) {
          case 'N': opts.name2 = 1; break;
  
          case 9: opts.no_pg = 1; break;
-        case 3: opts.order = optarg; break;
+        case 3:
+            opts.order = optarg;
+            if (strlen(optarg) > 3 && optarg[2] == ':')
+                opts.order_str = atoi(optarg+3);
+            break;
  
          case 'h': return usage(stdout, EXIT_SUCCESS);
          case '?': return usage(stderr, EXIT_FAILURE);
diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c

index 842ff60781ff651484c78a4d034707e61f17eec8..f16a7811fc4718a9375d9657804b2eb9d56b39e3 100644 (file)
--- a/samtools/bam_import.c.pysam.c
+++ b/samtools/bam_import.c.pysam.c
@@ -6,7 +6,7 @@
   *   samtools import a_1.fq a_2.fq
   *   samtools import a_interleaved.fq
   *
- * Copyright (C) 2020-2021 Genome Research Ltd.
+ * Copyright (C) 2020-2021, 2023 Genome Research Ltd.
   *
   * Author: James Bonfield <jkb@sanger.ac.uk>
   */
@@ -95,6 +95,7 @@ typedef struct {
      char *rg;
      char *rg_line;
      char *order;
+    int order_str;
      int compress_level;
      htsThreadPool p;
      int name2;
@@ -360,9 +361,23 @@ static int import_fastq(int argc, char **argv, opts_t *opts) {
              }
  
              if (opts->order) {
-                if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
-                    ret = -1;
-                    goto err;
+                if (opts->order_str) {
+                    char buf[25];
+                    snprintf(buf, sizeof(buf), "%0*"PRIu64,
+                             opts->order_str, read_num++);
+                    if (bam_aux_update_str(b, opts->order,
+                                           strlen(buf), buf) < 0) {
+                        ret = -1;
+                        goto err;
+                    }
+                } else {
+                    if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+                        ret = -1;
+                        goto err;
+                    }
+                    if (read_num == UINT_MAX)
+                        fprintf(samtools_stderr, "Warning: --order tag has overflowed."
+                                "  Consider using TAG:LENGTH instead\n");
                  }
              }
  
@@ -423,6 +438,7 @@ int main_import(int argc, char *argv[]) {
          .rg = NULL,
          .rg_line = NULL,
          .order = NULL,
+        .order_str = 0,
          .compress_level = -1,
          .name2 = 0,
      };
@@ -472,7 +488,11 @@ int main_import(int argc, char *argv[]) {
          case 'N': opts.name2 = 1; break;
  
          case 9: opts.no_pg = 1; break;
-        case 3: opts.order = optarg; break;
+        case 3:
+            opts.order = optarg;
+            if (strlen(optarg) > 3 && optarg[2] == ':')
+                opts.order_str = atoi(optarg+3);
+            break;
  
          case 'h': return usage(samtools_stdout, EXIT_SUCCESS);
          case '?': return usage(samtools_stderr, EXIT_FAILURE);
diff --git a/samtools/bam_index.c b/samtools/bam_index.c

index f7c3358a0ce119f582b104b63e1b2f1c13b76282..0803f3e4244c3bc1247282b0ebe184cff8486621 100644 (file)
--- a/samtools/bam_index.c
+++ b/samtools/bam_index.c
@@ -1,6 +1,6 @@
  /*  bam_index.c -- index and idxstats subcommands.
  
-    Copyright (C) 2008-2011, 2013-2016, 2018, 2019  Genome Research Ltd.
+    Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023  Genome Research Ltd.
      Portions copyright (C) 2010 Broad Institute.
      Portions copyright (C) 2013 Peter Cock, The James Hutton Institute.
  
@@ -47,12 +47,12 @@ static void index_usage(FILE *fp)
  "Usage: samtools index -M [-bc] [-m INT] <in1.bam> <in2.bam>...\n"
  "   or: samtools index [-bc] [-m INT] <in.bam> [out.index]\n"
  "Options:\n"
-"  -b       Generate BAI-format index for BAM files [default]\n"
-"  -c       Generate CSI-format index for BAM files\n"
-"  -m INT   Set minimum interval size for CSI indices to 2^INT [%d]\n"
-"  -M       Interpret all filename arguments as files to be indexed\n"
-"  -o FILE  Write index to FILE [alternative to <out.index> as an argument]\n"
-"  -@ INT   Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
+"  -b, --bai            Generate BAI-format index for BAM files [default]\n"
+"  -c, --csi            Generate CSI-format index for BAM files\n"
+"  -m, --min-shift INT  Set minimum interval size for CSI indices to 2^INT [%d]\n"
+"  -M                   Interpret all filename arguments as files to be indexed\n"
+"  -o, --output FILE    Write index to FILE [alternative to <out.index> in args]\n"
+"  -@, --threads INT    Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
  }
  
  // Returns 1 if the file does not exist or can be positively
@@ -80,7 +80,16 @@ int bam_index(int argc, char *argv[])
      int n_files, c, i, ret;
      const char *fn_idx = NULL;
  
-    while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0)
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'),
+        {"output",    required_argument, NULL, 'o'},
+        {"bai",       no_argument,       NULL, 'b'},
+        {"csi",       no_argument,       NULL, 'c'},
+        {"min-shift", required_argument, NULL, 'm'},
+        { NULL, 0, NULL, 0 }
+    };
+
+    while ((c = getopt_long(argc, argv, "bcm:Mo:@:", lopts, NULL)) >= 0)
          switch (c) {
          case 'b': csi = 0; break;
          case 'c': csi = 1; break;
diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c

index 6627cfac95b97f48f8b5ed914d7e32bfc4d8fc95..3093c01382ff1e509b50281dc5007a78c3ad7a39 100644 (file)
--- a/samtools/bam_index.c.pysam.c
+++ b/samtools/bam_index.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_index.c -- index and idxstats subcommands.
  
-    Copyright (C) 2008-2011, 2013-2016, 2018, 2019  Genome Research Ltd.
+    Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023  Genome Research Ltd.
      Portions copyright (C) 2010 Broad Institute.
      Portions copyright (C) 2013 Peter Cock, The James Hutton Institute.
  
@@ -49,12 +49,12 @@ static void index_usage(FILE *fp)
  "Usage: samtools index -M [-bc] [-m INT] <in1.bam> <in2.bam>...\n"
  "   or: samtools index [-bc] [-m INT] <in.bam> [out.index]\n"
  "Options:\n"
-"  -b       Generate BAI-format index for BAM files [default]\n"
-"  -c       Generate CSI-format index for BAM files\n"
-"  -m INT   Set minimum interval size for CSI indices to 2^INT [%d]\n"
-"  -M       Interpret all filename arguments as files to be indexed\n"
-"  -o FILE  Write index to FILE [alternative to <out.index> as an argument]\n"
-"  -@ INT   Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
+"  -b, --bai            Generate BAI-format index for BAM files [default]\n"
+"  -c, --csi            Generate CSI-format index for BAM files\n"
+"  -m, --min-shift INT  Set minimum interval size for CSI indices to 2^INT [%d]\n"
+"  -M                   Interpret all filename arguments as files to be indexed\n"
+"  -o, --output FILE    Write index to FILE [alternative to <out.index> in args]\n"
+"  -@, --threads INT    Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
  }
  
  // Returns 1 if the file does not exist or can be positively
@@ -82,7 +82,16 @@ int bam_index(int argc, char *argv[])
      int n_files, c, i, ret;
      const char *fn_idx = NULL;
  
-    while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0)
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'),
+        {"output",    required_argument, NULL, 'o'},
+        {"bai",       no_argument,       NULL, 'b'},
+        {"csi",       no_argument,       NULL, 'c'},
+        {"min-shift", required_argument, NULL, 'm'},
+        { NULL, 0, NULL, 0 }
+    };
+
+    while ((c = getopt_long(argc, argv, "bcm:Mo:@:", lopts, NULL)) >= 0)
          switch (c) {
          case 'b': csi = 0; break;
          case 'c': csi = 1; break;
diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c

index fc333c432ae81b86c734f3b89b0df1b589f83d0f..677a47f8283c19957f6dfc7da041f542af3fec55 100644 (file)
--- a/samtools/bam_markdup.c
+++ b/samtools/bam_markdup.c
@@ -76,6 +76,7 @@ typedef struct {
      regex_t *bc_rgx;
      int read_groups;
      int json;
+    int dc;
  } md_param_t;
  
  typedef struct {
@@ -96,6 +97,7 @@ typedef struct read_queue_s {
      bam1_t *b;
      struct read_queue_s *duplicate;
      struct read_queue_s *original;
+    int dc;
      hts_pos_t pos;
      int dup_checked;
      int read_group;
@@ -1616,6 +1618,7 @@ static int bam_mark_duplicates(md_param_t *param) {
          in_read->original = NULL;
          in_read->dup_checked = 0;
          in_read->read_group = 0;
+        in_read->dc = 1;
  
          if (param->read_groups) {
              uint8_t *data;
@@ -1703,6 +1706,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                          }
  
                          bp->p = in_read;
+                        bp->p->dc += 1;
  
                          if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
                              goto fail;
@@ -1765,6 +1769,7 @@ static int bam_mark_duplicates(md_param_t *param) {
  
                      if (new_score + tie_add > old_score) { // swap reads
                          dup = bp->p->b;
+                        in_read->dc += bp->p->dc;
  
                          if (param->check_chain) {
  
@@ -1805,6 +1810,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                          }
  
                          dup = in_read->b;
+                        bp->p->dc += 1;
                      }
  
                      if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings))
@@ -1846,6 +1852,8 @@ static int bam_mark_duplicates(md_param_t *param) {
                              in_read->original = bp->p;
                          }
  
+                        bp->p->dc += 1;
+
                          if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings))
                              goto fail;
  
@@ -1860,6 +1868,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                          // to the single hash and mark the other as duplicate
                          if (new_score > old_score) { // swap reads
                              dup = bp->p->b;
+                            in_read->dc += bp->p->dc;
  
                              if (param->check_chain) {
                                  in_read->duplicate = bp->p;
@@ -1877,6 +1886,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                                  in_read->original = bp->p;
                              }
  
+                            bp->p->dc += 1;
                              dup = in_read->b;
                          }
  
@@ -1914,6 +1924,9 @@ static int bam_mark_duplicates(md_param_t *param) {
              }
  
              if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+                if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+                    bam_aux_update_int(in_read->b, "dc", in_read->dc);
+                }
                  if (param->supp) {
                      if (tmp_file_write(&temp, in_read->b)) {
                          print_error("markdup", "error, writing temp output failed.\n");
@@ -1977,12 +1990,20 @@ static int bam_mark_duplicates(md_param_t *param) {
              }
  
              if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+                if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+                    bam_aux_update_int(in_read->b, "dc",  in_read->dc);
+                }
+
                  if (param->supp) {
                      if (tmp_file_write(&temp, in_read->b)) {
                          print_error("markdup", "error, writing temp output failed on final write.\n");
                          goto fail;
                      }
                  } else {
+                    if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+                        bam_aux_update_int(in_read->b, "dc", in_read->dc);
+                    }
+
                      if (sam_write1(param->out, header, in_read->b) < 0) {
                          print_error("markdup", "error, writing output failed on final write.\n");
                          goto fail;
@@ -2044,6 +2065,10 @@ static int bam_mark_duplicates(md_param_t *param) {
              }
  
              if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
+                if (param->dc && (b->core.flag & BAM_FDUP)) {
+                    uint8_t* data = bam_aux_get(b, "dc");
+                    if(data) bam_aux_del(b, data);
+                }
                  if (sam_write1(param->out, header, b) < 0) {
                      print_error("markdup", "error, writing final output failed.\n");
                      goto fail;
@@ -2179,6 +2204,7 @@ static int bam_mark_duplicates(md_param_t *param) {
      if (param->check_chain && (param->tag || param->opt_dist))
          free(dup_list.c);
  
+    free(idx_fn);
      free(stat_array);
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
@@ -2205,6 +2231,7 @@ static int bam_mark_duplicates(md_param_t *param) {
      if (param->check_chain && (param->tag || param->opt_dist))
          free(dup_list.c);
  
+    free(idx_fn);
      free(stat_array);
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
@@ -2242,6 +2269,7 @@ static int markdup_usage(void) {
      fprintf(stderr, "  --use-read-groups  Use the read group tags in duplicate matching.\n");
      fprintf(stderr, "  -t                 Mark primary duplicates with the name of the original in a \'do\' tag."
                                          " Mainly for information and debugging.\n");
+    fprintf(stderr, "  --duplicate-count  Record the original primary read duplication count(include itself) in a \'dc\' tag.\n");
  
      sam_global_opt_help(stderr, "-.O..@..");
  
@@ -2263,7 +2291,7 @@ int bam_markdup(int argc, char **argv) {
      char *regex = NULL, *bc_regex = NULL;
      char *regex_order = "txy";
      md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                        1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0};
+                        1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0, 0};
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
@@ -2278,6 +2306,7 @@ int bam_markdup(int argc, char **argv) {
          {"barcode-rgx", required_argument, NULL, 1008},
          {"use-read-groups", no_argument, NULL, 1009},
          {"json", no_argument, NULL, 1010},
+        {"duplicate-count", no_argument, NULL, 1011},
          {NULL, 0, NULL, 0}
      };
  
@@ -2314,6 +2343,7 @@ int bam_markdup(int argc, char **argv) {
              case 1008: bc_name = 1, bc_regex = optarg; break;
              case 1009: param.read_groups = 1; break;
              case 1010: param.json = 1; param.do_stats = 1; break;
+            case 1011: param.dc = 1; break;
              default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
              /* else fall-through */
              case '?': return markdup_usage();
diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c

index 3e3b0b5f812ea72e27c015c0b0fba16a0caec1ff..e8fea3d2a1b4b94382256203be42eda78181aecf 100644 (file)
--- a/samtools/bam_markdup.c.pysam.c
+++ b/samtools/bam_markdup.c.pysam.c
@@ -78,6 +78,7 @@ typedef struct {
      regex_t *bc_rgx;
      int read_groups;
      int json;
+    int dc;
  } md_param_t;
  
  typedef struct {
@@ -98,6 +99,7 @@ typedef struct read_queue_s {
      bam1_t *b;
      struct read_queue_s *duplicate;
      struct read_queue_s *original;
+    int dc;
      hts_pos_t pos;
      int dup_checked;
      int read_group;
@@ -1618,6 +1620,7 @@ static int bam_mark_duplicates(md_param_t *param) {
          in_read->original = NULL;
          in_read->dup_checked = 0;
          in_read->read_group = 0;
+        in_read->dc = 1;
  
          if (param->read_groups) {
              uint8_t *data;
@@ -1705,6 +1708,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                          }
  
                          bp->p = in_read;
+                        bp->p->dc += 1;
  
                          if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
                              goto fail;
@@ -1767,6 +1771,7 @@ static int bam_mark_duplicates(md_param_t *param) {
  
                      if (new_score + tie_add > old_score) { // swap reads
                          dup = bp->p->b;
+                        in_read->dc += bp->p->dc;
  
                          if (param->check_chain) {
  
@@ -1807,6 +1812,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                          }
  
                          dup = in_read->b;
+                        bp->p->dc += 1;
                      }
  
                      if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings))
@@ -1848,6 +1854,8 @@ static int bam_mark_duplicates(md_param_t *param) {
                              in_read->original = bp->p;
                          }
  
+                        bp->p->dc += 1;
+
                          if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings))
                              goto fail;
  
@@ -1862,6 +1870,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                          // to the single hash and mark the other as duplicate
                          if (new_score > old_score) { // swap reads
                              dup = bp->p->b;
+                            in_read->dc += bp->p->dc;
  
                              if (param->check_chain) {
                                  in_read->duplicate = bp->p;
@@ -1879,6 +1888,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                                  in_read->original = bp->p;
                              }
  
+                            bp->p->dc += 1;
                              dup = in_read->b;
                          }
  
@@ -1916,6 +1926,9 @@ static int bam_mark_duplicates(md_param_t *param) {
              }
  
              if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+                if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+                    bam_aux_update_int(in_read->b, "dc", in_read->dc);
+                }
                  if (param->supp) {
                      if (tmp_file_write(&temp, in_read->b)) {
                          print_error("markdup", "error, writing temp output failed.\n");
@@ -1979,12 +1992,20 @@ static int bam_mark_duplicates(md_param_t *param) {
              }
  
              if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+                if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+                    bam_aux_update_int(in_read->b, "dc",  in_read->dc);
+                }
+
                  if (param->supp) {
                      if (tmp_file_write(&temp, in_read->b)) {
                          print_error("markdup", "error, writing temp output failed on final write.\n");
                          goto fail;
                      }
                  } else {
+                    if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+                        bam_aux_update_int(in_read->b, "dc", in_read->dc);
+                    }
+
                      if (sam_write1(param->out, header, in_read->b) < 0) {
                          print_error("markdup", "error, writing output failed on final write.\n");
                          goto fail;
@@ -2046,6 +2067,10 @@ static int bam_mark_duplicates(md_param_t *param) {
              }
  
              if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
+                if (param->dc && (b->core.flag & BAM_FDUP)) {
+                    uint8_t* data = bam_aux_get(b, "dc");
+                    if(data) bam_aux_del(b, data);
+                }
                  if (sam_write1(param->out, header, b) < 0) {
                      print_error("markdup", "error, writing final output failed.\n");
                      goto fail;
@@ -2181,6 +2206,7 @@ static int bam_mark_duplicates(md_param_t *param) {
      if (param->check_chain && (param->tag || param->opt_dist))
          free(dup_list.c);
  
+    free(idx_fn);
      free(stat_array);
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
@@ -2207,6 +2233,7 @@ static int bam_mark_duplicates(md_param_t *param) {
      if (param->check_chain && (param->tag || param->opt_dist))
          free(dup_list.c);
  
+    free(idx_fn);
      free(stat_array);
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
@@ -2244,6 +2271,7 @@ static int markdup_usage(void) {
      fprintf(samtools_stderr, "  --use-read-groups  Use the read group tags in duplicate matching.\n");
      fprintf(samtools_stderr, "  -t                 Mark primary duplicates with the name of the original in a \'do\' tag."
                                          " Mainly for information and debugging.\n");
+    fprintf(samtools_stderr, "  --duplicate-count  Record the original primary read duplication count(include itself) in a \'dc\' tag.\n");
  
      sam_global_opt_help(samtools_stderr, "-.O..@..");
  
@@ -2265,7 +2293,7 @@ int bam_markdup(int argc, char **argv) {
      char *regex = NULL, *bc_regex = NULL;
      char *regex_order = "txy";
      md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                        1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0};
+                        1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0, 0};
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
@@ -2280,6 +2308,7 @@ int bam_markdup(int argc, char **argv) {
          {"barcode-rgx", required_argument, NULL, 1008},
          {"use-read-groups", no_argument, NULL, 1009},
          {"json", no_argument, NULL, 1010},
+        {"duplicate-count", no_argument, NULL, 1011},
          {NULL, 0, NULL, 0}
      };
  
@@ -2316,6 +2345,7 @@ int bam_markdup(int argc, char **argv) {
              case 1008: bc_name = 1, bc_regex = optarg; break;
              case 1009: param.read_groups = 1; break;
              case 1010: param.json = 1; param.do_stats = 1; break;
+            case 1011: param.dc = 1; break;
              default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
              /* else fall-through */
              case '?': return markdup_usage();
diff --git a/samtools/bam_md.c b/samtools/bam_md.c

index d7fd60fb2fbef00c13c3d9d4303ae2152a2ce735..b9182b6a8915ddf7a189cd9d57263c46f94f1a1e 100644 (file)
--- a/samtools/bam_md.c
+++ b/samtools/bam_md.c
@@ -411,8 +411,11 @@ int bam_fillmd(int argc, char *argv[])
  
      header = sam_hdr_read(fp);
      if (header == NULL || sam_hdr_nref(header) == 0) {
-        fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
-        goto fail;
+        // NB: if we have no SQ headers but have aligned data, then this will
+        // be caught during processing with e.g.
+        // "[E::sam_parse1] no SQ lines present in the header"
+        fprintf(stderr, "[bam_fillmd] warning: input SAM does not have "
+                "header, performing a no-op.\n");
      }
  
      fpout = sam_open_format("-", mode_w, &ga.out);
diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c

index 0daf177419017180ed3c485e7354d20c87486a4a..795eccb21c5074892b692c924f475cb314cad60c 100644 (file)
--- a/samtools/bam_md.c.pysam.c
+++ b/samtools/bam_md.c.pysam.c
@@ -413,8 +413,11 @@ int bam_fillmd(int argc, char *argv[])
  
      header = sam_hdr_read(fp);
      if (header == NULL || sam_hdr_nref(header) == 0) {
-        fprintf(samtools_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
-        goto fail;
+        // NB: if we have no SQ headers but have aligned data, then this will
+        // be caught during processing with e.g.
+        // "[E::sam_parse1] no SQ lines present in the header"
+        fprintf(samtools_stderr, "[bam_fillmd] warning: input SAM does not have "
+                "header, performing a no-op.\n");
      }
  
      fpout = sam_open_format(samtools_stdout_fn, mode_w, &ga.out);
diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c

index 0ad308a982fd189987d1c1c40c89368a7104f045..f84c805385fd2fde742fed6810093d6ce7b1ba6e 100644 (file)
--- a/samtools/bam_reheader.c
+++ b/samtools/bam_reheader.c
@@ -127,6 +127,11 @@ int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg)
      if (!h)
          return ret;
  
+    // Match output version number with input file.
+    char vers[99];
+    sprintf(vers, "%d.%d", cram_major_vers(in), cram_minor_vers(in));
+    cram_set_option(out, CRAM_OPT_VERSION, vers);
+
      // Attempt to fill out a cram->refs[] array from @SQ headers
      sam_hdr_t *cram_h = sam_hdr_dup(h);
      if (!cram_h)
diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c

index 22a6cd929f0ccdf9c3339c192b0b66d2c2b9a125..5a78c661370981d2da5ea3a8fa3225c519093744 100644 (file)
--- a/samtools/bam_reheader.c.pysam.c
+++ b/samtools/bam_reheader.c.pysam.c
@@ -129,6 +129,11 @@ int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg)
      if (!h)
          return ret;
  
+    // Match output version number with input file.
+    char vers[99];
+    sprintf(vers, "%d.%d", cram_major_vers(in), cram_minor_vers(in));
+    cram_set_option(out, CRAM_OPT_VERSION, vers);
+
      // Attempt to fill out a cram->refs[] array from @SQ headers
      sam_hdr_t *cram_h = sam_hdr_dup(h);
      if (!cram_h)
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c

index 875e29c2e36e4abff9380ed723c6597c68431c50..b44bd665de8eb92120b82be280e7bcb323c625b5 100644 (file)
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -1,6 +1,6 @@
  /*  bam_sort.c -- sorting and merging.
  
-    Copyright (C) 2008-2022 Genome Research Ltd.
+    Copyright (C) 2008-2023 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -54,6 +54,8 @@ DEALINGS IN THE SOFTWARE.  */
  #include "bedidx.h"
  #include "bam.h"
  
+//#define DEBUG_MINHASH
+
  #define BAM_BLOCK_SIZE 2*1024*1024
  #define MAX_TMP_FILES 64
  
@@ -1783,7 +1785,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
                              htsThreadPool *htspool,
                              const char *cmd, const htsFormat *in_fmt,
                              const htsFormat *out_fmt, char *arg_list, int no_pg,
-                            int write_index) {
+                            int write_index, int final_out) {
      samFile *fpout = NULL, **fp = NULL;
      heap1_t *heap = NULL;
      uint64_t idx = 0;
@@ -1884,7 +1886,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
      ks_heapmake(heap, heap_size, heap);
      while (heap->pos != HEAP_EMPTY) {
          bam1_t *b = heap->entry.bam_record;
-        if (g_sam_order == MinHash && b->core.tid == -1) {
+        if (g_sam_order == MinHash && b->core.tid == -1 && final_out) {
              // Remove the cached minhash value
              b->core.pos = -1;
              b->core.mpos = -1;
@@ -2052,6 +2054,11 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
  //
  // The 64-bit sort key is split over the bam pos and isize fields.
  // This permits it to survive writing to temporary file and coming back.
+
+#ifdef DEBUG_MINHASH
+static int ntot = 0, nmis = 0, ndup = 0;
+#endif
+
  static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
  {
      const bam1_t *A = a.bam_record;
@@ -2062,16 +2069,18 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
  
      if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b);
  
-    const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
-    const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
+    const uint64_t m_a = (((uint64_t)A->core.pos)<<31)|(uint32_t)A->core.mpos;
+    const uint64_t m_b = (((uint64_t)B->core.pos)<<31)|(uint32_t)B->core.mpos;
  
      if (m_a < m_b) // by hash
          return -1;
      else if (m_a > m_b)
          return 1;
-    else if (A->core.isize < B->core.isize) // by hash location in seq
+
+    // Bigger pos with same minhash means starts further to left
+    else if (A->core.isize > B->core.isize) // by hash location in seq
          return -1;
-    else if (A->core.isize > B->core.isize)
+    else if (A->core.isize < B->core.isize)
          return 1;
      else
          return bam1_cmp_core(a,b);
@@ -2243,6 +2252,8 @@ typedef struct {
      int error;
      int large_pos;
      int minimiser_kmer;
+    bool try_rev;
+    bool no_squash;
  } worker_t;
  
  // Returns 0 for success
@@ -2273,6 +2284,8 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu
      for (i = 0; i < l; ++i) {
          bam1_t *b = buf[i].bam_record;
          if (clear_minhash && b->core.tid == -1) {
+            // To see the position for debugging
+            // b->core.pos = ((((uint64_t)b->core.pos)<<31)|(uint32_t)b->core.mpos) + b->core.isize;
              // Remove the cached minhash value
              b->core.pos = -1;
              b->core.mpos = -1;
@@ -2381,8 +2394,15 @@ err:
      return ret;
  }
  
+KHASH_MAP_INIT_INT64(kmer, int64_t)
+static khash_t(kmer) *kmer_h = NULL;
+
+// Punt homopolymers somewhere central in the hash space
+#define XOR 0xdead7878beef7878
+
  /*
- * Computes the minhash of a sequence using both forward and reverse strands.
+ * Computes the minhash of a sequence using forward strand and if requested
+ * reverse strand.
   *
   * This is used as a sort key for unmapped data, to collate like sequences
   * together and to improve compression ratio.
@@ -2390,13 +2410,219 @@ err:
   * The minhash is returned and *pos filled out with location of this hash
   * key in the sequence if pos != NULL.
   */
-static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+static uint64_t minhash(bam1_t *b, int kmer, int window, int *curr_pos,
+                        int *end, int *is_rev, int try_fwd, int try_rev,
+                        int no_squash) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX;
-    uint64_t hashr = 0, minhashr = UINT64_MAX;
-    int minhashpf = 0, minhashpr = 0, i;
+    int minhashpf = *curr_pos, i, j;
+    // 2 bits per base.  1ULL, as "long" is only 32 bits on LLP64 platforms.
+    uint64_t mask = kmer < 32 ? (1ULL<<(2*kmer))-1 : UINT64_MAX;
+    uint8_t *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    uint64_t xor = XOR & mask;
+    if (is_rev) *is_rev = 0;  // forward strand unless the reverse hash wins
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    static unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    int i_start = *curr_pos;
+    int i_end = MIN(i_start + window, len);  // scan is clipped to the read
+    int last_base = -1;
+
+    if (try_fwd) {
+        // Initialise hash keys
+        for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) {
+            int base = bam_seqi(seq, i);
+            // collapse homopolymers
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashf = (hashf<<2) | L[base];
+                j++;
+            }
+        }
+
+        // Loop to find minimum
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashf = (hashf<<2) | L[base];
+                uint64_t hashfx = (hashf ^ XOR) & mask;
+                if (minhashf > hashfx)
+                    minhashf = hashfx, minhashpf = i;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashf = (hashf<<2) | L[base];
+                    uint64_t hashfx = (hashf ^ XOR) & mask;
+                    if (minhashf > hashfx)
+                        minhashf = hashfx, minhashpf = i;
+                }
+            }
+        }
+    }
+
+    // Same as above for the reverse strand.
+    // Not used for now, but we may wish to consider indexing in both
+    // strands, recording the strand in value (pos), and comparing in one
+    // strand only.  Right now we compare on both against a single-stranded
+    // index.
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX;
+        int minhashpr = *curr_pos;
+        int last_base = -1;
+
+        for (i = i_start, j = 0; j < kmer-1 && i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashr = (hashr>>2) | R[base];
+                j++;
+            }
+        }
+
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashr =  (hashr>>2) | R[base];
+                if (minhashr > (hashr^xor))
+                    minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashr =  (hashr>>2) | R[base];
+                    if (minhashr > (hashr^xor))
+                        minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+                }
+            }
+        }
+
+        if (minhashr < minhashf) {
+            minhashf  = minhashr;
+            minhashpf = minhashpr;
+            if (is_rev) *is_rev = 1;  // reverse-strand hash is the smaller one
+        }
+    }
+
+    // "*curr_pos = minhashpf" is faster here, but is sometimes
+    // poorer in compression.  Eg 10 million novaseq records with
+    // 75.1MB vs 76.9MB cram BA field.
+    //*curr_pos = minhashpf;
+    *curr_pos = minhashpf - (kmer-1);
+    if (end) *end = (i_end == len);  // true once the window reaches read end
+    return minhashf;
+}
+
+#define UNIQ_BIT  60 // kh_value flag bit; set when a key occurs more than once
+#define UNIQ_TEST(x) (((x) & (1ULL<<UNIQ_BIT))==0)
+#define UNIQ_MASK ((1ULL<<UNIQ_BIT)-1)
+// Builds the global kmer_h index from the sequences in fn: each forward-strand
+// minimiser maps to its position in the concatenation of all indexed records.
+// Records shorter than window are skipped.  Returns 0 on success, 1 on failure.
+static int build_minhash_index(char *fn, int kmer, int window, int no_squash) {
+    int ret = 1;
+    samFile *in = sam_open(fn, "r");
+    sam_hdr_t *h = NULL;
+    bam1_t *b = NULL;
+
+    if (!in) {
+        perror(fn);
+        return 1;
+    }
+
+    kmer_h = kh_init(kmer);
+    if (!kmer_h)
+        goto err;
+
+    if (!(h = sam_hdr_read(in)))
+        goto err;
+
+    if (!(b = bam_init1()))
+        goto err;
+
+    int r;
+    uint64_t tpos = 0;
+    while ((r = sam_read1(in, h, b)) >= 0) {
+        //fprintf(stderr, "LEN\t%d\t%s\n", b->core.l_qseq, bam_get_qname(b));
+        int pos = 0, end = 0;
+
+        if (b->core.l_qseq < window)
+            continue;
+
+        // fwd
+        while (!end) {
+            int last_pos = pos, absent;
+            uint64_t hashf = minhash(b, kmer, window, &pos, &end, NULL, 1, 0,
+                                     no_squash);
+            khiter_t k = kh_put(kmer, kmer_h, hashf, &absent);
+            if (absent < 0) goto err; // kh_put failed; k must not be used
+            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!absent)<<UNIQ_BIT);
+            pos = MAX(last_pos+kmer, pos+1);
+            //pos++;  Slower, but indexes a bit better?
+        }
+        tpos += b->core.l_qseq;
+
+// We could also add reverse keys to the index here.
+// This would avoid reverse complementing during the matching stage.
+// We'd need to add a flag (another high bit of kh_value) to indicate
+// strand.
+// I'm unsure if this is a good trade-off or not.
+
+//        // rev
+//        pos = 0; end = 0;
+//        while (!end) {
+//            hashf = minhash(b, kmer, window, &pos, &end, NULL, 0, 1,
+//                            no_squash);
+//            k = kh_put(kmer, kmer_h, hashf, &ret);
+//            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<<UNIQ_BIT);
+//            pos++;
+//        }
+//
+//        tpos += b->core.l_qseq;
+    }
+    if (r < -1)
+        goto err;
+
+    ret = 0;
+ err:
+    if (b) bam_destroy1(b);
+    if (h) sam_hdr_destroy(h);
+    sam_close(in);
+
+    return ret;
+}
+
+/*
+ * A variant of minhash that compares against a previously built index.
+ *
+ * We follow the same steps of scanning through this sequence to find the
+ * minimum hash, but we prefer hash keys that have unique placement in the
+ * index, or if not unique, then non-uniquely placed, over ones that
+ * are absent from the index.
+ */
+static uint64_t minhash_with_idx(bam1_t *b, int kmer, int *pos, int *rev,
+                                 bool try_rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+    uint64_t minhashfd = UINT64_MAX;
+    int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
      uint64_t mask = (1L<<(2*kmer))-1;
      unsigned char *seq = bam_get_seq(b);
      int len = b->core.l_qseq;
+    const uint64_t xor = XOR & mask;
  
      // Lookup tables for bam_seqi to 0123 fwd/rev hashes
      // =ACM GRSV TWYH KDBN
@@ -2410,39 +2636,266 @@ static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
      for (i = 0; i < 16; i++)
          R[i] <<= 2*(kmer-1);
  
-    // Punt homopolymers somewhere central in the hash space
-#define XOR (0xdead7878beef7878 & mask)
-
      // Initialise hash keys
-    for (i = 0; i < kmer-1 && i < len; i++) {
+    for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
          int base = bam_seqi(seq, i);
          hashf = (hashf<<2) | L[base];
-        hashr = (hashr>>2) | R[base];
      }
  
      // Loop to find minimum
+    int found_f = 0, found_r = 0;
      for (; i < len; i++) {
          int base = bam_seqi(seq, i);
+        hashf = ((hashf<<2) | L[base]) & mask;
+        const uint64_t hashfx = hashf^xor;
+
+        // Priority for sorting
+        // 1. Unique key in index
+        // 2. Dup key in index
+        // 3. Everything else
+        int index = 0;
+        if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+            khiter_t k = kh_get(kmer, kmer_h, hashfx);
+            if (k != kh_end(kmer_h))
+                index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+        }
+        found_f |= index;
+        switch (index) {
+        case 2: minhashfi = hashfx, minhashpfi = i; break;
+        case 1: minhashfd = hashfx, minhashpfd = i; break;
+
+        default:
+            if (minhashf > hashfx)
+                minhashf = hashfx, minhashpf = i;
+        }
+    }
+
+    if (minhashfi != UINT64_MAX)
+        minhashf = minhashfi, minhashpf = minhashpfi;
+    else if (minhashfd != UINT64_MAX)
+        minhashf = minhashfd, minhashpf = minhashpfd;
+
+    // Same as above for the reverse strand
+    int dir = 0;
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+        uint64_t minhashrd = UINT64_MAX;
+        int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+
+        for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
+            int base = bam_seqi(seq, i);
+            hashr = (hashr>>2) | R[base];
+        }
+        for (; i < len; i++) {
+            int base = bam_seqi(seq, i);
+            hashr =  (hashr>>2) | R[base];
+            const uint64_t hashrx = hashr^xor;
+
+            int index = 0;
+            if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+                khiter_t k = kh_get(kmer, kmer_h, hashrx);
+                if (k != kh_end(kmer_h))
+                    index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+            }
+            found_r |= index;
+            switch (index) {
+            case 2: minhashri = hashrx, minhashpri = i; break;
+            case 1: minhashrd = hashrx, minhashprd = i; break;
+
+            default:
+                if (minhashr > hashrx)
+                    minhashr = hashrx, minhashpr = i;
+            }
+        }
+        if (minhashri != UINT64_MAX)
+            minhashr = minhashri, minhashpr = minhashpri;
+        else if (minhashrd != UINT64_MAX)
+            minhashr = minhashrd, minhashpr = minhashprd;
+
+        // Pick reverse if better mapping
+        if ((minhashf > minhashr) || (!found_f && found_r)) {
+            if (!found_f || found_r) {
+                minhashf  = minhashr;
+                minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+                dir = 1;
+            }
+        }
+    }
+
+#ifdef DEBUG_MINHASH
+    ntot++;
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h)) {
+        if (!UNIQ_TEST(kh_value(kmer_h, k)))
+            ndup++;
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+    } else {
+        nmis++;
+    }
+#else
+    // For indexed kmers, our hash key is the position the kmer
+    // occurs in the concatenated reference rather than the hash itself.
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h))
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+    if (rev) *rev = dir;
+    if (pos) *pos = minhashpf;
  
+    return minhashf != UINT64_MAX ? minhashf : 0;
+}
+
+// As per minhash_with_idx but with homopolymer squashing enabled.
+// This function is duplicated to remove conditionals and speed up the
+// hashing code. (Minus the ifdef-ed out code, which is kept above mainly
+// for posterity.)
+static uint64_t minhash_with_idx_squash(bam1_t *b, int kmer, int *pos,
+                                        int *rev, bool try_rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+    uint64_t minhashfd = UINT64_MAX;
+    int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
+    // 2 bits per base.  1ULL, as "long" is only 32 bits on LLP64 platforms.
+    uint64_t mask = kmer < 32 ? (1ULL<<(2*kmer))-1 : UINT64_MAX;
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    const uint64_t xor = XOR & mask;
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    // Initialise hash keys
+    int last_base = -1;
+    for (i = j = 0; j < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
+        j++;
+        hashf = (hashf<<2) | L[base];
+    }
+
+    // Loop to find minimum
+    int found_f = 0, found_r = 0;
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
          hashf = ((hashf<<2) | L[base]) & mask;
-        hashr =  (hashr>>2) | R[base];
+        const uint64_t hashfx = hashf^xor;
+
+        // Priority for sorting
+        // 1. Unique key in index
+        // 2. Dup key in index
+        // 3. Everything else
+        int index = 0;
+        if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+            khiter_t k = kh_get(kmer, kmer_h, hashfx);
+            if (k != kh_end(kmer_h))
+                index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+        }
+        found_f |= index;
+        switch (index) {
+        case 2: minhashfi = hashfx, minhashpfi = i; break;
+        case 1: minhashfd = hashfx, minhashpfd = i; break;
  
-        if (minhashf > (hashf^XOR))
-            minhashf = (hashf^XOR), minhashpf = i;
-        if (minhashr > (hashr^XOR))
-            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+        default:
+            if (minhashf > hashfx)
+                minhashf = hashfx, minhashpf = i;
+        }
+    }
+
+    if (minhashfi != UINT64_MAX)
+        minhashf = minhashfi, minhashpf = minhashpfi;
+    else if (minhashfd != UINT64_MAX)
+        minhashf = minhashfd, minhashpf = minhashpfd;
+
+    // Same as above for the reverse strand
+    int dir = 0;
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+        uint64_t minhashrd = UINT64_MAX;
+        int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+        int last_base = -1;
+
+        for (i = j = 0; j < kmer-1 && i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            j++;
+            hashr = (hashr>>2) | R[base];
+        }
+        for (; i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            hashr =  (hashr>>2) | R[base];
+            const uint64_t hashrx = hashr^xor;
+
+            int index = 0;
+            if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+                khiter_t k = kh_get(kmer, kmer_h, hashrx);
+                if (k != kh_end(kmer_h))
+                    index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+            }
+            found_r |= index;
+            switch (index) {
+            case 2: minhashri = hashrx, minhashpri = i; break;
+            case 1: minhashrd = hashrx, minhashprd = i; break;
  
+            default:
+                if (minhashr > hashrx)
+                    minhashr = hashrx, minhashpr = i;
+            }
+        }
+        if (minhashri != UINT64_MAX)
+            minhashr = minhashri, minhashpr = minhashpri;
+        else if (minhashrd != UINT64_MAX)
+            minhashr = minhashrd, minhashpr = minhashprd;
+
+        // Pick reverse if better mapping
+        if ((minhashf > minhashr) || (!found_f && found_r)) {
+            if (!found_f || found_r) {
+                minhashf  = minhashr;
+                minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+                dir = 1;
+            }
+        }
      }
  
-    if (minhashf <= minhashr) {
-        if (rev) *rev = 0;
-        if (pos) *pos = minhashpf;
-        return minhashf;
+#ifdef DEBUG_MINHASH
+    ntot++;
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h)) {
+        if (!UNIQ_TEST(kh_value(kmer_h, k)))
+            ndup++;
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
      } else {
-        if (rev) *rev = 1;
-        if (pos) *pos = minhashpr;
-        return minhashr;
+        nmis++;
      }
+#else
+    // For indexed kmers, our hash key is the position the kmer
+    // occurs in the concatenated reference rather than the hash itself.
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h))
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+    if (rev) *rev = dir;  // 1 when the reverse-strand key was selected
+    if (pos) *pos = minhashpf;
+
+    return minhashf != UINT64_MAX ? minhashf : 0;  // never the sentinel
  }
  
  //--- Start of candidates to punt to htslib
@@ -2556,18 +3009,35 @@ static inline void worker_minhash(worker_t *w) {
              continue;
  
          int pos = 0, rev = 0;
-        uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev);
+        uint64_t mh = kmer_h
+            ? (w->no_squash
+               ? minhash_with_idx(b, w->minimiser_kmer, &pos, &rev,
+                                  w->try_rev)
+               : minhash_with_idx_squash(b, w->minimiser_kmer, &pos, &rev,
+                                         w->try_rev)
+               )
+            : minhash(b, w->minimiser_kmer, b->core.l_qseq,
+                      &pos, NULL, &rev, 1, w->try_rev, w->no_squash);
          if (rev)
              reverse_complement(b);
  
+        if (!kmer_h) {
+            mh += 1LL<<30;
+            pos = 65535-pos >= 0 ? 65535-pos : 0;
+        } else {
+            mh -= pos;
+            pos = 0;
+        }
+
+
          // Store 64-bit hash in unmapped pos and mpos fields.
          // The position of hash is in isize, which we use for
          // resolving ties when sorting by hash key.
          // These are unused for completely unmapped data and
          // will be reset during final output.
-        b->core.pos = mh>>31;
+        b->core.pos = (mh>>31) & 0x7fffffff;
          b->core.mpos = mh&0x7fffffff;
-        b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+        b->core.isize = pos;
      }
  }
  
@@ -2595,7 +3065,8 @@ static void *worker(void *data)
  
  static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
                         int n_threads, buf_region *in_mem,
-                       int large_pos, int minimiser_kmer)
+                       int large_pos, int minimiser_kmer, bool try_rev,
+                       bool no_squash)
  {
      int i;
      size_t pos, rest;
@@ -2619,6 +3090,8 @@ static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
          w[i].h = h;
          w[i].large_pos = large_pos;
          w[i].minimiser_kmer = minimiser_kmer;
+        w[i].try_rev = try_rev;
+        w[i].no_squash = no_squash;
          in_mem[i].from = pos;
          in_mem[i].to = pos + w[i].buf_len;
          pos += w[i].buf_len; rest -= w[i].buf_len;
@@ -2700,6 +3173,7 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header)
    @param  sam_order the order in which the sort should occur
    @param  sort_tag  the tag to use if sorting by Tag
    @param  minimiser_kmer the kmer size when sorting by MinHash
+  @param  try_rev  try reverse strand when sorting by MinHash
    @param  fn       name of the file to be sorted
    @param  prefix   prefix of the temporary files (prefix.NNNN.bam are written)
    @param  fnout    name of the final output file to be written
@@ -2717,9 +3191,9 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header)
    NOT thread safe.
   */
  int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
-                      const char *fn, const char *prefix,
-                      const char *fnout, const char *modeout,
-                      size_t _max_mem, int n_threads,
+                      bool try_rev, bool no_squash, const char *fn,
+                      const char *prefix, const char *fnout,
+                      const char *modeout, size_t _max_mem, int n_threads,
                        const htsFormat *in_fmt, const htsFormat *out_fmt,
                        char *arg_list, int no_pg, int write_index)
  {
@@ -2958,7 +3432,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
                  goto err;
  
              int sort_res = sort_blocks(k, buf, header, n_threads,
-                                       in_mem, large_pos, minimiser_kmer);
+                                       in_mem, large_pos, minimiser_kmer,
+                                       try_rev, no_squash);
              if (sort_res < 0)
                  goto err;
  
@@ -2988,7 +3463,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
                                       &fns[consolidate_from], n_threads,
                                       in_mem, buf, keys,
                                       lib_lookup, &htspool, "sort", NULL, NULL,
-                                     NULL, 1, 0) >= 0) {
+                                     NULL, 1, 0, 0) >= 0) {
                      merge_res = 0;
                      break;
                  }
@@ -3031,7 +3506,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
      // Sort last records
      if (k > 0) {
          num_in_mem = sort_blocks(k, buf, header, n_threads,
-                                 in_mem, large_pos, minimiser_kmer);
+                                 in_mem, large_pos, minimiser_kmer, try_rev,
+                                 no_squash);
          if (num_in_mem < 0) goto err;
      } else {
          num_in_mem = 0;
@@ -3060,7 +3536,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
          if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
                               n_files, fns, num_in_mem, in_mem, buf, keys,
                               lib_lookup, &htspool, "sort", in_fmt, out_fmt,
-                             arg_list, no_pg, write_index) < 0) {
+                             arg_list, no_pg, write_index, 1) < 0) {
              // Propagate bam_merge_simple() failure; it has already emitted a
              // message explaining the failure, so no further message is needed.
              goto err;
@@ -3109,7 +3585,8 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
      sprintf(fnout, "%s.bam", prefix);
      SamOrder sam_order = is_by_qname ? QueryName : Coordinate;
      g_sam_order = sam_order;
-    ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+    ret = bam_sort_core_ext(sam_order, NULL, 0, false, true, fn, prefix,
+                            fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
      free(fnout);
      return ret;
  }
@@ -3123,7 +3600,11 @@ static void sort_usage(FILE *fp)
  "  -u         Output uncompressed data (equivalent to -l 0)\n"
  "  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
  "  -M         Use minimiser for clustering unaligned/unplaced reads\n"
+"  -R         Do not use reverse strand (only compatible with -M)\n"
  "  -K INT     Kmer size to use for minimiser [20]\n"
+"  -I FILE    Order minimisers by their position in FILE FASTA\n"
+"  -w INT     Window size for minimiser indexing via -I ref.fa [100]\n"
+"  -H         Squash homopolymers when computing minimiser\n"
  "  -n         Sort by read name (not compatible with samtools index command)\n"
  "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
  "  -o FILE    Write final output to FILE rather than standard output\n"
@@ -3159,11 +3640,15 @@ int bam_sort(int argc, char *argv[])
      SamOrder sam_order = Coordinate;
      bool by_tag = false;
      int minimiser_kmer = 20;
+    bool try_rev = true;
      char* sort_tag = NULL, *arg_list = NULL;
      char *fnout = "-", modeout[12];
      kstring_t tmpprefix = { 0, 0, NULL };
      struct stat st;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    int window = 100;
+    char *minimiser_ref = NULL;
+    int no_squash = 1;
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
@@ -3173,7 +3658,7 @@ int bam_sort(int argc, char *argv[])
          { NULL, 0, NULL, 0 }
      };
  
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
          switch (c) {
          case 'o': fnout = optarg; o_seen = 1; break;
          case 'n': sam_order = QueryName; break;
@@ -3192,6 +3677,15 @@ int bam_sort(int argc, char *argv[])
          case   1: no_pg = 1; break;
          case   2: sam_order = TemplateCoordinate; break;
          case 'M': sam_order = MinHash; break;
+        case 'I':
+            sam_order = MinHash; // implicit option
+            minimiser_ref = optarg;
+            break;
+        case 'H': no_squash = 0; break;
+
+        case 'w': window = atoi(optarg); break;
+
+        case 'R': try_rev = false; break;
          case 'K':
              minimiser_kmer = atoi(optarg);
              if (minimiser_kmer < 1)
@@ -3206,6 +3700,17 @@ int bam_sort(int argc, char *argv[])
          }
      }
  
+    if (minimiser_ref) {
+        fprintf(stderr, "Building index ... ");
+        fflush(stderr);
+        if (build_minhash_index(minimiser_ref, minimiser_kmer, window,
+                                no_squash)) {
+            ret = EXIT_FAILURE;
+            goto sort_end;
+        }
+        fprintf(stderr, "done\n");
+    }
+
      // Change sort order if tag sorting is requested.  Must update based on secondary index
      if (by_tag) {
          sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
@@ -3262,7 +3767,9 @@ int bam_sort(int argc, char *argv[])
          ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
      }
  
-    ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0,
+    ret = bam_sort_core_ext(sam_order, sort_tag,
+                            (sam_order == MinHash) ? minimiser_kmer : 0,
+                            try_rev, no_squash,
                              (nargs > 0) ? argv[optind] : "-",
                              tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
                              &ga.in, &ga.out, arg_list, no_pg, ga.write_index);
@@ -3278,6 +3785,12 @@ int bam_sort(int argc, char *argv[])
          ret = EXIT_FAILURE;
      }
  
+#ifdef DEBUG_MINHASH
+    fprintf(stderr, "Missed %.1f%%, dup %.1f%%\n",
+            100.0*nmis/(ntot+.1),
+            100.0*ndup/(ntot+.1));
+#endif
+
  sort_end:
      free(tmpprefix.s);
      free(arg_list);
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c

index 4353f61b89809ab569bb0b696528ccfefb1127f7..80aa4d8076d7abea0994be7684bb43159d85e690 100644 (file)
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_sort.c -- sorting and merging.
  
-    Copyright (C) 2008-2022 Genome Research Ltd.
+    Copyright (C) 2008-2023 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -56,6 +56,8 @@ DEALINGS IN THE SOFTWARE.  */
  #include "bedidx.h"
  #include "bam.h"
  
+//#define DEBUG_MINHASH
+
  #define BAM_BLOCK_SIZE 2*1024*1024
  #define MAX_TMP_FILES 64
  
@@ -1785,7 +1787,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
                              htsThreadPool *htspool,
                              const char *cmd, const htsFormat *in_fmt,
                              const htsFormat *out_fmt, char *arg_list, int no_pg,
-                            int write_index) {
+                            int write_index, int final_out) {
      samFile *fpout = NULL, **fp = NULL;
      heap1_t *heap = NULL;
      uint64_t idx = 0;
@@ -1886,7 +1888,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
      ks_heapmake(heap, heap_size, heap);
      while (heap->pos != HEAP_EMPTY) {
          bam1_t *b = heap->entry.bam_record;
-        if (g_sam_order == MinHash && b->core.tid == -1) {
+        if (g_sam_order == MinHash && b->core.tid == -1 && final_out) {
              // Remove the cached minhash value
              b->core.pos = -1;
              b->core.mpos = -1;
@@ -2054,6 +2056,11 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
  //
  // The 64-bit sort key is split over the bam pos and isize fields.
  // This permits it to survive writing to temporary file and coming back.
+
+#ifdef DEBUG_MINHASH
+static int ntot = 0, nmis = 0, ndup = 0;
+#endif
+
  static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
  {
      const bam1_t *A = a.bam_record;
@@ -2064,16 +2071,18 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
  
      if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b);
  
-    const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
-    const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
+    const uint64_t m_a = (((uint64_t)A->core.pos)<<31)|(uint32_t)A->core.mpos;
+    const uint64_t m_b = (((uint64_t)B->core.pos)<<31)|(uint32_t)B->core.mpos;
  
      if (m_a < m_b) // by hash
          return -1;
      else if (m_a > m_b)
          return 1;
-    else if (A->core.isize < B->core.isize) // by hash location in seq
+
+    // Bigger pos with size minhash means starts further to left
+    else if (A->core.isize > B->core.isize) // by hash location in seq
          return -1;
-    else if (A->core.isize > B->core.isize)
+    else if (A->core.isize < B->core.isize)
          return 1;
      else
          return bam1_cmp_core(a,b);
@@ -2245,6 +2254,8 @@ typedef struct {
      int error;
      int large_pos;
      int minimiser_kmer;
+    bool try_rev;
+    bool no_squash;
  } worker_t;
  
  // Returns 0 for success
@@ -2275,6 +2286,8 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu
      for (i = 0; i < l; ++i) {
          bam1_t *b = buf[i].bam_record;
          if (clear_minhash && b->core.tid == -1) {
+            // To see the position for debugging
+            // b->core.pos = ((((uint64_t)b->core.pos)<<31)|(uint32_t)b->core.mpos) + b->core.isize;
              // Remove the cached minhash value
              b->core.pos = -1;
              b->core.mpos = -1;
@@ -2383,8 +2396,15 @@ err:
      return ret;
  }
  
+KHASH_MAP_INIT_INT64(kmer, int64_t)
+static khash_t(kmer) *kmer_h = NULL;
+
+// Punt homopolymers somewhere central in the hash space
+#define XOR 0xdead7878beef7878
+
  /*
- * Computes the minhash of a sequence using both forward and reverse strands.
+ * Computes the minhash of a sequence using forward strand and if requested
+ * reverse strand.
   *
   * This is used as a sort key for unmapped data, to collate like sequences
   * together and to improve compression ratio.
@@ -2392,13 +2412,219 @@ err:
   * The minhash is returned and *pos filled out with location of this hash
   * key in the sequence if pos != NULL.
   */
-static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+static uint64_t minhash(bam1_t *b, int kmer, int window, int *curr_pos,
+                        int *end, int *is_rev, int try_fwd, int try_rev,
+                        int no_squash) {
+    // Scans at most "window" bases of b's sequence, starting at *curr_pos,
+    // and returns the smallest scrambled kmer hash seen.  UINT64_MAX is
+    // returned when no complete kmer was hashed.
+    //
+    //   kmer       kmer length in bases (2 bits per base)
+    //   window     maximum number of bases to scan from *curr_pos
+    //   curr_pos   in:  index of the first base to scan
+    //              out: recorded position of the winning hash minus (kmer-1)
+    //   end        if non-NULL, set to 1 when the scan reached the end of
+    //              the sequence, otherwise 0
+    //   is_rev     if non-NULL, set to 1 when the reverse strand hash won,
+    //              otherwise 0
+    //   try_fwd    non-zero to hash the forward strand
+    //   try_rev    non-zero to hash the reverse complement strand
+    //   no_squash  non-zero to hash every base; zero to skip any base equal
+    //              to the previously hashed one (homopolymer squashing)
       uint64_t hashf = 0, minhashf = UINT64_MAX;
-    uint64_t hashr = 0, minhashr = UINT64_MAX;
-    int minhashpf = 0, minhashpr = 0, i;
+    int minhashpf = *curr_pos, i, j;
+    // 1ULL rather than 1L: long may be only 32 bits wide, which would make
+    // this shift undefined for kmer >= 16.
+    uint64_t mask = (1ULL<<(2*kmer))-1;
+    uint8_t *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    uint64_t xor = XOR & mask;
+
+    if (is_rev) *is_rev = 0;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    static unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    // R holds the complemented base code pre-shifted to the top base of
+    // the kmer, so "hashr>>2 | R[base]" builds the reverse complement.
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    int i_start = *curr_pos;
+    int i_end = MIN(i_start + window, len);
+    int last_base = -1;
+
+    if (try_fwd) {
+        // Initialise hash keys
+        for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) {
+            int base = bam_seqi(seq, i);
+            // collapse homopolymers
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashf = (hashf<<2) | L[base];
+                j++;
+            }
+        }
+
+        // Loop to find minimum
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashf = (hashf<<2) | L[base];
+                uint64_t hashfx = (hashf ^ XOR) & mask;
+                if (minhashf > hashfx)
+                    minhashf = hashfx, minhashpf = i;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashf = (hashf<<2) | L[base];
+                    uint64_t hashfx = (hashf ^ XOR) & mask;
+                    if (minhashf > hashfx)
+                        minhashf = hashfx, minhashpf = i;
+                }
+            }
+        }
+    }
+
+    // Same as above for the reverse strand.
+    // Not used for now, but we may wish to consider indexing in both
+    // strands, recording the strand in value (pos), and comparing in one
+    // strand only.  Right now we compare on both against a single-stranded
+    // index.
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX;
+        int minhashpr = *curr_pos;
+        int last_base = -1;
+
+        // Prime the key from the same window as the forward strand.  The
+        // scan loops below are bounded by i_end, so stopping here at i_end
+        // yields the same result as running on to len.
+        for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) {
+            int base = bam_seqi(seq, i);
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashr = (hashr>>2) | R[base];
+                j++;
+            }
+        }
+
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashr =  (hashr>>2) | R[base];
+                if (minhashr > (hashr^xor))
+                    minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashr =  (hashr>>2) | R[base];
+                    if (minhashr > (hashr^xor))
+                        minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+                }
+            }
+        }
+
+        // The reverse strand only wins when strictly smaller.
+        if (minhashr < minhashf) {
+            minhashf  = minhashr;
+            minhashpf = minhashpr;
+            if (is_rev) *is_rev = 1;
+        }
+    }
+
+    // "*curr_pos = minhashpf" is faster here, but is sometimes
+    // poorer in compression.  Eg 10 million novaseq records with
+    // 75.1MB vs 76.9MB cram BA field.
+    //*curr_pos = minhashpf;
+    *curr_pos = minhashpf - (kmer-1);
+    if (end) *end = (i_end == len);
+    return minhashf;
+}
+
+// Values stored in kmer_h hold a position in the low bits plus a flag in
+// bit UNIQ_BIT that is set when the same hash was inserted more than once.
+#define UNIQ_BIT  60
+// True when the duplicate flag is clear, i.e. the hash was inserted once.
+#define UNIQ_TEST(x) (((x) & (1ULL<<UNIQ_BIT))==0)
+// Mask selecting the position bits of an index value.
+#define UNIQ_MASK ((1ULL<<UNIQ_BIT)-1)
+
+/*
+ * Builds the global kmer_h index from the sequences held in file fn.
+ *
+ * Every record with at least "window" bases is scanned window by window
+ * with minhash() on the forward strand only.  Each minimiser found is stored
+ * in kmer_h keyed on its hash; the value is its position within the
+ * concatenation of the indexed sequences.  When a hash is already present
+ * its value is replaced and the UNIQ_BIT flag is set.
+ *
+ * Returns 0 on success;
+ *         1 on failure, in which case kmer_h is freed and reset to NULL.
+ */
+static int build_minhash_index(char *fn, int kmer, int window, int no_squash) {
+    int ret = 1;
+    samFile *in;
+    sam_hdr_t *h = NULL;
+    bam1_t *b = NULL;
+
+    in = sam_open(fn, "r");
+    if (!in) {
+        perror(fn);
+        return 1;
+    }
+
+    kmer_h = kh_init(kmer);
+    if (!kmer_h)
+        goto err;
+
+    if (!(h = sam_hdr_read(in)))
+        goto err;
+
+    if (!(b = bam_init1()))
+        goto err;
+
+    int r;
+    uint64_t tpos = 0; // offset of this record in the concatenated sequences
+    while ((r = sam_read1(in, h, b)) >= 0) {
+        //fprintf(samtools_stderr, "LEN\t%d\t%s\n", b->core.l_qseq, bam_get_qname(b));
+        uint64_t hashf;
+        int pos = 0, end = 0;
+        khiter_t k;
+        int absent; // kh_put status: 0 key already present, <0 failure
+
+        // Records shorter than one window are not indexed and do not
+        // advance tpos.
+        if (b->core.l_qseq < window)
+            continue;
+
+        // fwd
+        while (!end) {
+            int last_pos = pos;
+            hashf = minhash(b, kmer, window, &pos, &end, NULL, 1, 0,
+                            no_squash);
+            k = kh_put(kmer, kmer_h, hashf, &absent);
+            if (absent < 0)
+                goto err; // allocation failed; k is not a usable slot
+            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!absent)<<UNIQ_BIT);
+            // Always step forwards by at least one kmer from the previous
+            // window start.
+            pos = MAX(last_pos+kmer, pos+1);
+            //pos++;  Slower, but indexes a bit better?
+        }
+        tpos += b->core.l_qseq;
+
+// We could also add reverse keys to the index here.
+// This would avoid reverse complementing during the matching stage.
+// We'd need to add a flag (another high bit of kh_value) to indicate
+// strand.
+// I'm unsure if this is a good trade-off or not.
+
+//        // rev
+//        pos = 0; end = 0;
+//        while (!end) {
+//            hashf = minhash(b, kmer, window, &pos, &end, NULL, 0, 1,
+//                            no_squash);
+//            k = kh_put(kmer, kmer_h, hashf, &absent);
+//            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!absent)<<UNIQ_BIT);
+//            pos++;
+//        }
+//
+//        tpos += b->core.l_qseq;
+    }
+    // sam_read1() returns -1 at end of file; anything lower is an error.
+    if (r < -1)
+        goto err;
+
+    ret = 0;
+ err:
+    if (b) bam_destroy1(b);
+    if (h) sam_hdr_destroy(h);
+    sam_close(in);
+
+    // Do not leave a partially built index behind on failure.
+    if (ret != 0 && kmer_h) {
+        kh_destroy(kmer, kmer_h);
+        kmer_h = NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * A variant of minhash that compares against a previously built index.
+ *
+ * We follow the same steps of scanning through this sequence to find the
+ * minimum hash, but we prefer hash keys that have unique placement in the
+ * index, or if not unique, then non-uniquely placed, over ones that
+ * are absent from the index.
+ */
+static uint64_t minhash_with_idx(bam1_t *b, int kmer, int *pos, int *rev,
+                                 bool try_rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+    uint64_t minhashfd = UINT64_MAX;
+    int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
      uint64_t mask = (1L<<(2*kmer))-1;
      unsigned char *seq = bam_get_seq(b);
      int len = b->core.l_qseq;
+    const uint64_t xor = XOR & mask;
  
      // Lookup tables for bam_seqi to 0123 fwd/rev hashes
      // =ACM GRSV TWYH KDBN
@@ -2412,39 +2638,266 @@ static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
      for (i = 0; i < 16; i++)
          R[i] <<= 2*(kmer-1);
  
-    // Punt homopolymers somewhere central in the hash space
-#define XOR (0xdead7878beef7878 & mask)
-
      // Initialise hash keys
-    for (i = 0; i < kmer-1 && i < len; i++) {
+    for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
          int base = bam_seqi(seq, i);
          hashf = (hashf<<2) | L[base];
-        hashr = (hashr>>2) | R[base];
      }
  
      // Loop to find minimum
+    int found_f = 0, found_r = 0;
      for (; i < len; i++) {
          int base = bam_seqi(seq, i);
+        hashf = ((hashf<<2) | L[base]) & mask;
+        const uint64_t hashfx = hashf^xor;
+
+        // Priority for sorting
+        // 1. Unique key in index
+        // 2. Dup key in index
+        // 3. Everything else
+        int index = 0;
+        if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+            khiter_t k = kh_get(kmer, kmer_h, hashfx);
+            if (k != kh_end(kmer_h))
+                index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+        }
+        found_f |= index;
+        switch (index) {
+        case 2: minhashfi = hashfx, minhashpfi = i; break;
+        case 1: minhashfd = hashfx, minhashpfd = i; break;
+
+        default:
+            if (minhashf > hashfx)
+                minhashf = hashfx, minhashpf = i;
+        }
+    }
+
+    if (minhashfi != UINT64_MAX)
+        minhashf = minhashfi, minhashpf = minhashpfi;
+    else if (minhashfd != UINT64_MAX)
+        minhashf = minhashfd, minhashpf = minhashpfd;
+
+    // Same as above for the reverse strand
+    int dir = 0;
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+        uint64_t minhashrd = UINT64_MAX;
+        int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+
+        for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
+            int base = bam_seqi(seq, i);
+            hashr = (hashr>>2) | R[base];
+        }
+        for (; i < len; i++) {
+            int base = bam_seqi(seq, i);
+            hashr =  (hashr>>2) | R[base];
+            const uint64_t hashrx = hashr^xor;
+
+            int index = 0;
+            if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+                khiter_t k = kh_get(kmer, kmer_h, hashrx);
+                if (k != kh_end(kmer_h))
+                    index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+            }
+            found_r |= index;
+            switch (index) {
+            case 2: minhashri = hashrx, minhashpri = i; break;
+            case 1: minhashrd = hashrx, minhashprd = i; break;
+
+            default:
+                if (minhashr > hashrx)
+                    minhashr = hashrx, minhashpr = i;
+            }
+        }
+        if (minhashri != UINT64_MAX)
+            minhashr = minhashri, minhashpr = minhashpri;
+        else if (minhashrd != UINT64_MAX)
+            minhashr = minhashrd, minhashpr = minhashprd;
+
+        // Pick reverse if better mapping
+        if ((minhashf > minhashr) || (!found_f && found_r)) {
+            if (!found_f || found_r) {
+                minhashf  = minhashr;
+                minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+                dir = 1;
+            }
+        }
+    }
+
+#ifdef DEBUG_MINHASH
+    ntot++;
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h)) {
+        if (!UNIQ_TEST(kh_value(kmer_h, k)))
+            ndup++;
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+    } else {
+        nmis++;
+    }
+#else
+    // For indexed kmers, our hash key is the position the kmer
+    // occurs in the concatenated reference rather than the hash itself.
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h))
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+    if (rev) *rev = dir;
+    if (pos) *pos = minhashpf;
  
+    return minhashf != UINT64_MAX ? minhashf : 0;
+}
+
+// As per minhash_with_idx but with homopolymer squashing enabled.
+// This function is duplicated to remove conditionals and speed up the
+// hashing code. (Minus the ifdef-ed out code, which is kept above mainly
+// for posterity.)
+static uint64_t minhash_with_idx_squash(bam1_t *b, int kmer, int *pos,
+                                        int *rev, bool try_rev) {
+    // Finds the sort key for record b by hashing its kmers, skipping any
+    // base equal to the previously hashed one, and looking them up in the
+    // global kmer_h index.
+    //
+    //   kmer     kmer length in bases (2 bits per base)
+    //   pos      if non-NULL, receives the recorded position of the chosen
+    //            kmer
+    //   rev      if non-NULL, set to 1 when the reverse strand was chosen,
+    //            otherwise 0
+    //   try_rev  also consider the reverse complement strand
+    //
+    // Returns the index value (minus the duplicate flag) when the chosen
+    // hash is present in kmer_h, otherwise the hash itself, or 0 when no
+    // kmer was hashed at all.
+    uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+    uint64_t minhashfd = UINT64_MAX;
+    int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
+    // 1ULL rather than 1L: long may be only 32 bits wide, which would make
+    // this shift undefined for kmer >= 16.
+    uint64_t mask = (1ULL<<(2*kmer))-1;
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    const uint64_t xor = XOR & mask;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    // Initialise hash keys
+    int last_base = -1;
+    for (i = j = 0; j < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
+        j++;
+        hashf = (hashf<<2) | L[base];
+    }
+
+    // Loop to find minimum
+    int found_f = 0, found_r = 0;
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
           hashf = ((hashf<<2) | L[base]) & mask;
-        hashr =  (hashr>>2) | R[base];
+        const uint64_t hashfx = hashf^xor;
+
+        // Priority for sorting
+        // 1. Unique key in index
+        // 2. Dup key in index
+        // 3. Everything else
+        int index = 0;
+        if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+            khiter_t k = kh_get(kmer, kmer_h, hashfx);
+            if (k != kh_end(kmer_h))
+                index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+        }
+        found_f |= index;
+        // NOTE(review): while no unique key has been found the guard above
+        // is always true, so case 1 keeps the most recent duplicate key
+        // rather than the smallest one.  The reverse strand loop below
+        // behaves the same way - confirm this is intended.
+        switch (index) {
+        case 2: minhashfi = hashfx, minhashpfi = i; break;
+        case 1: minhashfd = hashfx, minhashpfd = i; break;
  
-        if (minhashf > (hashf^XOR))
-            minhashf = (hashf^XOR), minhashpf = i;
-        if (minhashr > (hashr^XOR))
-            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+        default:
+            if (minhashf > hashfx)
+                minhashf = hashfx, minhashpf = i;
+        }
+    }
+
+    if (minhashfi != UINT64_MAX)
+        minhashf = minhashfi, minhashpf = minhashpfi;
+    else if (minhashfd != UINT64_MAX)
+        minhashf = minhashfd, minhashpf = minhashpfd;
+
+    // Same as above for the reverse strand
+    int dir = 0;
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+        uint64_t minhashrd = UINT64_MAX;
+        int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+        int last_base = -1;
+
+        for (i = j = 0; j < kmer-1 && i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            j++;
+            hashr = (hashr>>2) | R[base];
+        }
+        for (; i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            hashr =  (hashr>>2) | R[base];
+            const uint64_t hashrx = hashr^xor;
+
+            int index = 0;
+            if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+                khiter_t k = kh_get(kmer, kmer_h, hashrx);
+                if (k != kh_end(kmer_h))
+                    index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+            }
+            found_r |= index;
+            switch (index) {
+            case 2: minhashri = hashrx, minhashpri = i; break;
+            case 1: minhashrd = hashrx, minhashprd = i; break;
  
+            default:
+                if (minhashr > hashrx)
+                    minhashr = hashrx, minhashpr = i;
+            }
+        }
+        if (minhashri != UINT64_MAX)
+            minhashr = minhashri, minhashpr = minhashpri;
+        else if (minhashrd != UINT64_MAX)
+            minhashr = minhashrd, minhashpr = minhashprd;
+
+        // Pick reverse if better mapping
+        if ((minhashf > minhashr) || (!found_f && found_r)) {
+            if (!found_f || found_r) {
+                minhashf  = minhashr;
+                // Convert the forward scan index into a reverse strand
+                // coordinate.
+                minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+                dir = 1;
+            }
+        }
       }
  
-    if (minhashf <= minhashr) {
-        if (rev) *rev = 0;
-        if (pos) *pos = minhashpf;
-        return minhashf;
+#ifdef DEBUG_MINHASH
+    ntot++;
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h)) {
+        if (!UNIQ_TEST(kh_value(kmer_h, k)))
+            ndup++;
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
       } else {
-        if (rev) *rev = 1;
-        if (pos) *pos = minhashpr;
-        return minhashr;
+        nmis++;
       }
+#else
+    // For indexed kmers, our hash key is the position the kmer
+    // occurs in the concatenated reference rather than the hash itself.
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h))
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+    if (rev) *rev = dir;
+    if (pos) *pos = minhashpf;
+
+    // UINT64_MAX means no kmer was hashed; report 0 in that case.
+    return minhashf != UINT64_MAX ? minhashf : 0;
   }
  
  //--- Start of candidates to punt to htslib
@@ -2558,18 +3011,35 @@ static inline void worker_minhash(worker_t *w) {
              continue;
  
          int pos = 0, rev = 0;
-        uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev);
+        uint64_t mh = kmer_h
+            ? (w->no_squash
+               ? minhash_with_idx(b, w->minimiser_kmer, &pos, &rev,
+                                  w->try_rev)
+               : minhash_with_idx_squash(b, w->minimiser_kmer, &pos, &rev,
+                                         w->try_rev)
+               )
+            : minhash(b, w->minimiser_kmer, b->core.l_qseq,
+                      &pos, NULL, &rev, 1, w->try_rev, w->no_squash);
          if (rev)
              reverse_complement(b);
  
+        if (!kmer_h) {
+            mh += 1LL<<30;
+            pos = 65535-pos >= 0 ? 65535-pos : 0;
+        } else {
+            mh -= pos;
+            pos = 0;
+        }
+
+
          // Store 64-bit hash in unmapped pos and mpos fields.
          // The position of hash is in isize, which we use for
          // resolving ties when sorting by hash key.
          // These are unused for completely unmapped data and
          // will be reset during final output.
-        b->core.pos = mh>>31;
+        b->core.pos = (mh>>31) & 0x7fffffff;
          b->core.mpos = mh&0x7fffffff;
-        b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+        b->core.isize = pos;
      }
  }
  
@@ -2597,7 +3067,8 @@ static void *worker(void *data)
  
  static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
                         int n_threads, buf_region *in_mem,
-                       int large_pos, int minimiser_kmer)
+                       int large_pos, int minimiser_kmer, bool try_rev,
+                       bool no_squash)
  {
      int i;
      size_t pos, rest;
@@ -2621,6 +3092,8 @@ static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
          w[i].h = h;
          w[i].large_pos = large_pos;
          w[i].minimiser_kmer = minimiser_kmer;
+        w[i].try_rev = try_rev;
+        w[i].no_squash = no_squash;
          in_mem[i].from = pos;
          in_mem[i].to = pos + w[i].buf_len;
          pos += w[i].buf_len; rest -= w[i].buf_len;
@@ -2702,6 +3175,7 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header)
    @param  sam_order the order in which the sort should occur
    @param  sort_tag  the tag to use if sorting by Tag
    @param  minimiser_kmer the kmer size when sorting by MinHash
+  @param  try_rev  try reverse strand when sorting by MinHash
    @param  fn       name of the file to be sorted
    @param  prefix   prefix of the temporary files (prefix.NNNN.bam are written)
    @param  fnout    name of the final output file to be written
@@ -2719,9 +3193,9 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header)
    NOT thread safe.
   */
  int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
-                      const char *fn, const char *prefix,
-                      const char *fnout, const char *modeout,
-                      size_t _max_mem, int n_threads,
+                      bool try_rev, bool no_squash, const char *fn,
+                      const char *prefix, const char *fnout,
+                      const char *modeout, size_t _max_mem, int n_threads,
                        const htsFormat *in_fmt, const htsFormat *out_fmt,
                        char *arg_list, int no_pg, int write_index)
  {
@@ -2960,7 +3434,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
                  goto err;
  
              int sort_res = sort_blocks(k, buf, header, n_threads,
-                                       in_mem, large_pos, minimiser_kmer);
+                                       in_mem, large_pos, minimiser_kmer,
+                                       try_rev, no_squash);
              if (sort_res < 0)
                  goto err;
  
@@ -2990,7 +3465,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
                                       &fns[consolidate_from], n_threads,
                                       in_mem, buf, keys,
                                       lib_lookup, &htspool, "sort", NULL, NULL,
-                                     NULL, 1, 0) >= 0) {
+                                     NULL, 1, 0, 0) >= 0) {
                      merge_res = 0;
                      break;
                  }
@@ -3033,7 +3508,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
      // Sort last records
      if (k > 0) {
          num_in_mem = sort_blocks(k, buf, header, n_threads,
-                                 in_mem, large_pos, minimiser_kmer);
+                                 in_mem, large_pos, minimiser_kmer, try_rev,
+                                 no_squash);
          if (num_in_mem < 0) goto err;
      } else {
          num_in_mem = 0;
@@ -3062,7 +3538,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
          if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
                               n_files, fns, num_in_mem, in_mem, buf, keys,
                               lib_lookup, &htspool, "sort", in_fmt, out_fmt,
-                             arg_list, no_pg, write_index) < 0) {
+                             arg_list, no_pg, write_index, 1) < 0) {
              // Propagate bam_merge_simple() failure; it has already emitted a
              // message explaining the failure, so no further message is needed.
              goto err;
@@ -3111,7 +3587,8 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
      sprintf(fnout, "%s.bam", prefix);
      SamOrder sam_order = is_by_qname ? QueryName : Coordinate;
      g_sam_order = sam_order;
-    ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+    ret = bam_sort_core_ext(sam_order, NULL, 0, false, true, fn, prefix,
+                            fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
      free(fnout);
      return ret;
  }
@@ -3125,7 +3602,11 @@ static void sort_usage(FILE *fp)
  "  -u         Output uncompressed data (equivalent to -l 0)\n"
  "  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
  "  -M         Use minimiser for clustering unaligned/unplaced reads\n"
+"  -R         Do not use reverse strand (only compatible with -M)\n"
  "  -K INT     Kmer size to use for minimiser [20]\n"
+"  -I FILE    Order minimisers by their position in FILE FASTA\n"
+"  -w INT     Window size for minimiser indexing via -I ref.fa [100]\n"
+"  -H         Squash homopolymers when computing minimiser\n"
  "  -n         Sort by read name (not compatible with samtools index command)\n"
  "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
  "  -o FILE    Write final output to FILE rather than standard output\n"
@@ -3161,11 +3642,15 @@ int bam_sort(int argc, char *argv[])
      SamOrder sam_order = Coordinate;
      bool by_tag = false;
      int minimiser_kmer = 20;
+    bool try_rev = true;
      char* sort_tag = NULL, *arg_list = NULL;
      char *fnout = "-", modeout[12];
      kstring_t tmpprefix = { 0, 0, NULL };
      struct stat st;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    int window = 100;
+    char *minimiser_ref = NULL;
+    int no_squash = 1;
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
@@ -3175,7 +3660,7 @@ int bam_sort(int argc, char *argv[])
          { NULL, 0, NULL, 0 }
      };
  
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
          switch (c) {
          case 'o': fnout = optarg; o_seen = 1; break;
          case 'n': sam_order = QueryName; break;
@@ -3194,6 +3679,15 @@ int bam_sort(int argc, char *argv[])
          case   1: no_pg = 1; break;
          case   2: sam_order = TemplateCoordinate; break;
          case 'M': sam_order = MinHash; break;
+        case 'I':
+            sam_order = MinHash; // implicit option
+            minimiser_ref = optarg;
+            break;
+        case 'H': no_squash = 0; break;
+
+        case 'w': window = atoi(optarg); break;
+
+        case 'R': try_rev = false; break;
          case 'K':
              minimiser_kmer = atoi(optarg);
              if (minimiser_kmer < 1)
@@ -3208,6 +3702,17 @@ int bam_sort(int argc, char *argv[])
          }
      }
  
+    if (minimiser_ref) {
+        fprintf(samtools_stderr, "Building index ... ");
+        fflush(samtools_stderr);
+        if (build_minhash_index(minimiser_ref, minimiser_kmer, window,
+                                no_squash)) {
+            ret = EXIT_FAILURE;
+            goto sort_end;
+        }
+        fprintf(samtools_stderr, "done\n");
+    }
+
      // Change sort order if tag sorting is requested.  Must update based on secondary index
      if (by_tag) {
          sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
@@ -3264,7 +3769,9 @@ int bam_sort(int argc, char *argv[])
          ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
      }
  
-    ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0,
+    ret = bam_sort_core_ext(sam_order, sort_tag,
+                            (sam_order == MinHash) ? minimiser_kmer : 0,
+                            try_rev, no_squash,
                              (nargs > 0) ? argv[optind] : "-",
                              tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
                              &ga.in, &ga.out, arg_list, no_pg, ga.write_index);
@@ -3280,6 +3787,12 @@ int bam_sort(int argc, char *argv[])
          ret = EXIT_FAILURE;
      }
  
+#ifdef DEBUG_MINHASH
+    fprintf(samtools_stderr, "Missed %.1f%%, dup %.1f%%\n",
+            100.0*nmis/(ntot+.1),
+            100.0*ndup/(ntot+.1));
+#endif
+
  sort_end:
      free(tmpprefix.s);
      free(arg_list);
diff --git a/samtools/bam_split.c b/samtools/bam_split.c

index 72a629838241e4fa921ad7238dc27d6c185457bb..e9f0fb591f8f0b712150d0100431773d7df37949 100644 (file)
--- a/samtools/bam_split.c
+++ b/samtools/bam_split.c
@@ -1,6 +1,6 @@
  /*  bam_split.c -- split subcommand.
  
-    Copyright (C) 2013-2016,2018-2019 Genome Research Ltd.
+    Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd.
  
      Author: Martin Pollard <mp15@sanger.ac.uk>
  
@@ -292,7 +292,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
          }
      }
  
-    retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
+    retval->merged_input_file = sam_open_format(opts->merged_input_name, "r", &opts->ga.in);
      if (!retval->merged_input_file) {
          print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
          cleanup_state(retval, false);
@@ -341,7 +341,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
              }
          }
  
-        retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
+        char outmode[4] = "w";
+        sam_open_mode(outmode + 1, opts->unaccounted_name, NULL);
+        retval->unaccounted_file = sam_open_format(opts->unaccounted_name, outmode, &opts->ga.out);
+
          if (retval->unaccounted_file == NULL) {
              print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
              cleanup_state(retval, false);
@@ -381,6 +384,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
      size_t i;
      for (i = 0; i < retval->output_count; i++) {
          char* output_filename = NULL;
+        char outmode[4] = "w";
  
          output_filename = expand_format_string(opts->output_format_string,
                                                 input_base_name,
@@ -394,7 +398,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
          }
  
          retval->rg_output_file_name[i] = output_filename;
-        retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
+
+        sam_open_mode(outmode + 1, output_filename, NULL);
+        retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+
          if (retval->rg_output_file[i] == NULL) {
              print_error_errno("split", "Could not open \"%s\"", output_filename);
              cleanup_state(retval, false);
diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c

index 10152345e70fd4d4af3bc73e5925bb6346bada7c..6c48466da729b35e22850fec99c935b57a56b389 100644 (file)
--- a/samtools/bam_split.c.pysam.c
+++ b/samtools/bam_split.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_split.c -- split subcommand.
  
-    Copyright (C) 2013-2016,2018-2019 Genome Research Ltd.
+    Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd.
  
      Author: Martin Pollard <mp15@sanger.ac.uk>
  
@@ -294,7 +294,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
          }
      }
  
-    retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
+    retval->merged_input_file = sam_open_format(opts->merged_input_name, "r", &opts->ga.in);
      if (!retval->merged_input_file) {
          print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
          cleanup_state(retval, false);
@@ -343,7 +343,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
              }
          }
  
-        retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
+        char outmode[4] = "w";
+        sam_open_mode(outmode + 1, opts->unaccounted_name, NULL);
+        retval->unaccounted_file = sam_open_format(opts->unaccounted_name, outmode, &opts->ga.out);
+
          if (retval->unaccounted_file == NULL) {
              print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
              cleanup_state(retval, false);
@@ -383,6 +386,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
      size_t i;
      for (i = 0; i < retval->output_count; i++) {
          char* output_filename = NULL;
+        char outmode[4] = "w";
  
          output_filename = expand_format_string(opts->output_format_string,
                                                 input_base_name,
@@ -396,7 +400,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list)
          }
  
          retval->rg_output_file_name[i] = output_filename;
-        retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
+
+        sam_open_mode(outmode + 1, output_filename, NULL);
+        retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+
          if (retval->rg_output_file[i] == NULL) {
              print_error_errno("split", "Could not open \"%s\"", output_filename);
              cleanup_state(retval, false);
diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c

index 05442bf6db4a73383bd2dd81482a4cbd30323a26..c297c5d35c5065000f98d6fab5f36115feaabe32 100644 (file)
--- a/samtools/bamshuf.c
+++ b/samtools/bamshuf.c
@@ -537,7 +537,7 @@ static int usage(FILE *fp, int n_files, int reads_store) {
              "      -l INT   Compression level [%d]\n" // DEF_CLEVEL
              "      -n INT   Number of temporary files [%d]\n" // n_files
              "      -T PREFIX\n"
-            "               Write tempory files to PREFIX.nnnn.bam\n"
+            "               Write temporary files to PREFIX.nnnn.bam\n"
              "      --no-PG  do not add a PG line\n",
              reads_store, DEF_CLEVEL, n_files);
  
diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c

index 6547b3ca9c7c15439f7b65798d77df613f624b25..e98bc8bb39c8bdeeeb80e821a58fea558b261a17 100644 (file)
--- a/samtools/bamshuf.c.pysam.c
+++ b/samtools/bamshuf.c.pysam.c
@@ -539,7 +539,7 @@ static int usage(FILE *fp, int n_files, int reads_store) {
              "      -l INT   Compression level [%d]\n" // DEF_CLEVEL
              "      -n INT   Number of temporary files [%d]\n" // n_files
              "      -T PREFIX\n"
-            "               Write tempory files to PREFIX.nnnn.bam\n"
+            "               Write temporary files to PREFIX.nnnn.bam\n"
              "      --no-PG  do not add a PG line\n",
              reads_store, DEF_CLEVEL, n_files);
  
diff --git a/samtools/consensus_pileup.c b/samtools/consensus_pileup.c

index b48aac2a9b6da449070e85354697fe5b24eb6a51..c9667b3cde25b10b016eda7e69ba77c2b52cec9e 100644 (file)
--- a/samtools/consensus_pileup.c
+++ b/samtools/consensus_pileup.c
@@ -1,6 +1,6 @@
  /*  consensus__pileup.h -- Pileup orientated data per consensus column
  
-    Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd.
+    Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd.
  
      Author: James Bonfied <jkb@sanger.ac.uk>
  
diff --git a/samtools/consensus_pileup.c.pysam.c b/samtools/consensus_pileup.c.pysam.c

index 99fb957256fd407673e479d71c69a2c9aaa6f7e8..adb68699891f53055accefd5f1cc40057e1426dc 100644 (file)
--- a/samtools/consensus_pileup.c.pysam.c
+++ b/samtools/consensus_pileup.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  consensus__pileup.h -- Pileup orientated data per consensus column
  
-    Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd.
+    Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd.
  
      Author: James Bonfied <jkb@sanger.ac.uk>
  
diff --git a/samtools/consensus_pileup.h b/samtools/consensus_pileup.h

index 7aacfaa44c77b2923b7d62168693858062a5fbb2..cc400aa2e9cd867681a0d26fd8529ac55de9840a 100644 (file)
--- a/samtools/consensus_pileup.h
+++ b/samtools/consensus_pileup.h
@@ -1,6 +1,6 @@
  /*  consensus_pileup.h -- Pileup orientated data per consensus column
  
-    Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd.
+    Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd.
  
      Author: James Bonfied <jkb@sanger.ac.uk>
  
diff --git a/samtools/cram_size.c b/samtools/cram_size.c

index 6c397bc37e3ca386dc1d54d85f13e743d390731e..54a987c6cb6d3ef28fe56dc7e810205212127937 100644 (file)
--- a/samtools/cram_size.c
+++ b/samtools/cram_size.c
@@ -558,8 +558,8 @@ static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp,
      fprintf(outfp, "Number of slices      %18"PRId64"\n", nslice);
      fprintf(outfp, "Number of sequences   %18"PRId64"\n", nseqs);
      fprintf(outfp, "Number of bases       %18"PRId64"\n", nbases);
-    fprintf(outfp, "Total file size       %18"PRId64"\n", end);
-    fprintf(outfp, "Format overhead size  %18"PRId64"\n", end - tot_size);
+    fprintf(outfp, "Total file size       %18"PRId64"\n", (int64_t) end);
+    fprintf(outfp, "Format overhead size  %18"PRId64"\n", (int64_t) (end - tot_size));
  
      return 0;
  
diff --git a/samtools/cram_size.c.pysam.c b/samtools/cram_size.c.pysam.c

index f260419145eb36111641b2668d82317bfbc8fb9d..b3031d00eeb2cd53288db051fc50d3442c0f4838 100644 (file)
--- a/samtools/cram_size.c.pysam.c
+++ b/samtools/cram_size.c.pysam.c
@@ -560,8 +560,8 @@ static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp,
      fprintf(outfp, "Number of slices      %18"PRId64"\n", nslice);
      fprintf(outfp, "Number of sequences   %18"PRId64"\n", nseqs);
      fprintf(outfp, "Number of bases       %18"PRId64"\n", nbases);
-    fprintf(outfp, "Total file size       %18"PRId64"\n", end);
-    fprintf(outfp, "Format overhead size  %18"PRId64"\n", end - tot_size);
+    fprintf(outfp, "Total file size       %18"PRId64"\n", (int64_t) end);
+    fprintf(outfp, "Format overhead size  %18"PRId64"\n", (int64_t) (end - tot_size));
  
      return 0;
  
diff --git a/samtools/reset.c b/samtools/reset.c

index f9b0c09f544f9a4a9981491635862b67819837c3..4e522cddf13a59b4575476a632aa23b28555f442 100644 (file)
--- a/samtools/reset.c
+++ b/samtools/reset.c
@@ -25,6 +25,8 @@ DEALINGS IN THE SOFTWARE
  
  */
  
+#include <config.h>
+
  #include "samtools.h"
  #include "htslib/sam.h"
  #include "sam_opts.h"
diff --git a/samtools/reset.c.pysam.c b/samtools/reset.c.pysam.c

index fdf44b936778ff6f9999fe0bbe9e6493c93a52c5..c98946f0228ff81bac07e6fe149efc95c83bf12a 100644 (file)
--- a/samtools/reset.c.pysam.c
+++ b/samtools/reset.c.pysam.c
@@ -27,6 +27,8 @@ DEALINGS IN THE SOFTWARE
  
  */
  
+#include <config.h>
+
  #include "samtools.h"
  #include "htslib/sam.h"
  #include "sam_opts.h"
diff --git a/samtools/sam_view.c b/samtools/sam_view.c

index d23e96589243fc5b5fd558473d9f8aba174820b3..aa5b92310dce313f56cee4409062bada951562f2 100644 (file)
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -139,11 +139,17 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
      }
  }
  
-// Returns 0 to indicate read should be output 1 otherwise
+// Returns 0 to indicate read should be output 1 otherwise,
+// and -1 on error.
  static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
  {
-    if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
-        return 1;
+    if (settings->filter) {
+        int r = sam_passes_filter(h, b, settings->filter);
+        if (r < 0)  // err
+            return -1;
+        if (r == 0) // filter-out
+            return 1;
+    }
  
      if (settings->remove_B) bam_remove_B(b);
      if (settings->min_qlen > 0) {
@@ -581,7 +587,9 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *
      while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) {
          if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue;
          if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue;
-        if ( process_aln(conf->header, rec, conf) ) continue;
+        int p = process_aln(conf->header, rec, conf);
+        if (p < 0)  goto out;
+        if (p == 1) continue;
  
          nmates++;
  
@@ -632,13 +640,16 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *
               k = kh_get(names,mate_names,bam_get_qname(rec));
               if ( k != kh_end(mate_names) ) drop = 0;
          }
-        if (!drop && process_aln(conf->header, rec, conf) == 0) {
+        int p = 0;
+        if (!drop && (p=process_aln(conf->header, rec, conf))== 0) {
              if (adjust_tags(conf->header, rec, conf) != 0)
                  goto out;
              if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out,
                                   &write_error) < 0)
                  goto out;
          }
+        if (p < 0)
+            goto out;
      }
  
      if (r < -1) {
@@ -669,7 +680,12 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b,
          if (bam_sanitize(conf->header, b, conf->sanitize) < 0)
              return -1;
  
-    if (!process_aln(conf->header, b, conf)) {
+    int p;
+    if ((p = process_aln(conf->header, b, conf)) < 0) {
+        // error
+        return -1;
+    } else if (p == 0) {
+        // emit read
          if (!conf->is_count) {
              change_flag(b, conf);
              if (adjust_tags(conf->header, b, conf) != 0)
@@ -710,17 +726,17 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b,
  
  static int stream_view(samview_settings_t *conf) {
      bam1_t *b = bam_init1();
-    int write_error = 0, r;
+    int write_error = 0, r, p = 0;
      if (!b) {
          print_error_errno("view", "could not allocate bam record");
          return 1;
      }
      errno = 0; // prevent false error messages.
      while ((r = sam_read1(conf->in, conf->header, b)) >= 0) {
-        if (process_one_record(conf, b, &write_error) < 0) break;
+        if ((p = process_one_record(conf, b, &write_error)) < 0) break;
      }
      bam_destroy1(b);
-    if (r < -1) {
+    if (r < -1 || p < 0) {
          print_error_errno("view", "error reading file \"%s\"", conf->fn_in);
          return 1;
      }
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c

index 7961862a954634a068770242b04104c7e25cb28b..e1b681b820ac50bc105d39bdaf63609c4110f28f 100644 (file)
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -141,11 +141,17 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
      }
  }
  
-// Returns 0 to indicate read should be output 1 otherwise
+// Returns 0 to indicate read should be output 1 otherwise,
+// and -1 on error.
  static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
  {
-    if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
-        return 1;
+    if (settings->filter) {
+        int r = sam_passes_filter(h, b, settings->filter);
+        if (r < 0)  // err
+            return -1;
+        if (r == 0) // filter-out
+            return 1;
+    }
  
      if (settings->remove_B) bam_remove_B(b);
      if (settings->min_qlen > 0) {
@@ -583,7 +589,9 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *
      while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) {
          if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue;
          if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue;
-        if ( process_aln(conf->header, rec, conf) ) continue;
+        int p = process_aln(conf->header, rec, conf);
+        if (p < 0)  goto out;
+        if (p == 1) continue;
  
          nmates++;
  
@@ -634,13 +642,16 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *
               k = kh_get(names,mate_names,bam_get_qname(rec));
               if ( k != kh_end(mate_names) ) drop = 0;
          }
-        if (!drop && process_aln(conf->header, rec, conf) == 0) {
+        int p = 0;
+        if (!drop && (p=process_aln(conf->header, rec, conf))== 0) {
              if (adjust_tags(conf->header, rec, conf) != 0)
                  goto out;
              if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out,
                                   &write_error) < 0)
                  goto out;
          }
+        if (p < 0)
+            goto out;
      }
  
      if (r < -1) {
@@ -671,7 +682,12 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b,
          if (bam_sanitize(conf->header, b, conf->sanitize) < 0)
              return -1;
  
-    if (!process_aln(conf->header, b, conf)) {
+    int p;
+    if ((p = process_aln(conf->header, b, conf)) < 0) {
+        // error
+        return -1;
+    } else if (p == 0) {
+        // emit read
          if (!conf->is_count) {
              change_flag(b, conf);
              if (adjust_tags(conf->header, b, conf) != 0)
@@ -712,17 +728,17 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b,
  
  static int stream_view(samview_settings_t *conf) {
      bam1_t *b = bam_init1();
-    int write_error = 0, r;
+    int write_error = 0, r, p = 0;
      if (!b) {
          print_error_errno("view", "could not allocate bam record");
          return 1;
      }
      errno = 0; // prevent false error messages.
      while ((r = sam_read1(conf->in, conf->header, b)) >= 0) {
-        if (process_one_record(conf, b, &write_error) < 0) break;
+        if ((p = process_one_record(conf, b, &write_error)) < 0) break;
      }
      bam_destroy1(b);
-    if (r < -1) {
+    if (r < -1 || p < 0) {
          print_error_errno("view", "error reading file \"%s\"", conf->fn_in);
          return 1;
      }
diff --git a/samtools/stats.c b/samtools/stats.c

index 06802b1a414ae7616f5a0be22f27915903d1c573..44783a974cc7f07aa00c68ee106f342567f960c8 100644 (file)
--- a/samtools/stats.c
+++ b/samtools/stats.c
@@ -1556,7 +1556,13 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
      fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
      fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
-    float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0;
+    float avg_read_length = (stats->nreads_1st +
+                             stats->nreads_2nd +
+                             stats->nreads_other)
+        ? (float)stats->total_len / (stats->nreads_1st +
+                                     stats->nreads_2nd +
+                                     stats->nreads_other)
+        : 0;
      fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length);
      fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0);
      fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? (float)stats->total_len_2nd/stats->nreads_2nd:0);
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c

index c3331957e516669b46a289d878d12381959518ab..b3462ccb20e9ef90f21dc51924b43e6458ef2481 100644 (file)
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -1558,7 +1558,13 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
      fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
      fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
-    float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0;
+    float avg_read_length = (stats->nreads_1st +
+                             stats->nreads_2nd +
+                             stats->nreads_other)
+        ? (float)stats->total_len / (stats->nreads_1st +
+                                     stats->nreads_2nd +
+                                     stats->nreads_other)
+        : 0;
      fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length);
      fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0);
      fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? (float)stats->total_len_2nd/stats->nreads_2nd:0);
diff --git a/samtools/version.sh b/samtools/version.sh

index 1ac941397143aeef89581bc5833c525c5ca64508..7d17aee189ee631985545c9c0e23533cb9f64392 100755 (executable)
--- a/samtools/version.sh
+++ b/samtools/version.sh
@@ -24,7 +24,7 @@
  # DEALINGS IN THE SOFTWARE.
  
  # Master version, for use in tarballs or non-git source copies
-VERSION=1.17
+VERSION=1.18
  
  # If we have a git clone, then check against the current tag
  if [ -e .git ]
diff --git a/setup.py b/setup.py

index 291d0f94b0bac99e0213c8499c384cd7c6bf9e88..a4bf36ddd1105ad882a6f2282b49688f8315707c 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -1,27 +1,21 @@
  #! /usr/bin/python
  
-'''pysam - a python module for reading, manipulating and writing
+'''pysam --- a Python package for reading, manipulating, and writing
  genomic data sets.
  
-pysam is a lightweight wrapper of the htslib C-API and provides
-facilities to read and write SAM/BAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ
-files as well as access to the command line functionality of the
-samtools and bcftools packages. The module supports compression and
-random access through indexing.
-
-This module provides a low-level wrapper around the htslib C-API as
-using cython and a high-level API for convenient access to the data
-within standard genomic file formats.
-
-See:
-http://www.htslib.org
-https://github.com/pysam-developers/pysam
-http://pysam.readthedocs.org/en/stable
+pysam is a lightweight wrapper of the HTSlib API and provides facilities
+to read and write SAM/BAM/CRAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ files
+as well as access to the command-line functionality of samtools and bcftools.
+The module supports compression and random access through indexing.
  
+This module provides a low-level wrapper around HTSlib's C API using Cython
+and a high-level API for convenient access to the data within standard genomic
+file formats.
  '''
  
  import collections
  import glob
+import logging
  import os
  import platform
  import re
@@ -29,13 +23,20 @@ import subprocess
  import sys
  import sysconfig
  from contextlib import contextmanager
-from distutils import log
  from setuptools import setup, Command
-from distutils.command.build import build
  from setuptools.command.sdist import sdist
-from distutils.errors import LinkError
+from setuptools.extension import Extension
+
+try:
+    from setuptools.errors import LinkError
+except ImportError:
+    from distutils.errors import LinkError
+
+try:
+    from Cython.Distutils import build_ext
+except ImportError:
+    from setuptools.command.build_ext import build_ext
  
-from cy_build import CyExtension as Extension, cy_build_ext as build_ext
  try:
      import cython  # noqa
      HAVE_CYTHON = True
@@ -45,6 +46,8 @@ except ImportError:
  IS_PYTHON3 = sys.version_info.major >= 3
  IS_DARWIN = platform.system() == 'Darwin'
  
+log = logging.getLogger('pysam')
+
  
  @contextmanager
  def changedir(path):
@@ -233,19 +236,34 @@ class cythonize_sdist(sdist):
          sdist.run(self)
  
  
-# Override build command to add extra build steps.
-class extra_build(build):
+# Override Cythonised build_ext command to customise macOS shared libraries.
+
+class CyExtension(Extension):
+    def __init__(self, *args, **kwargs):
+        self._init_func = kwargs.pop("init_func", None)
+        self._prebuild_func = kwargs.pop("prebuild_func", None)
+        Extension.__init__(self, *args, **kwargs)
+
+    def extend_includes(self, includes):
+        self.include_dirs.extend(includes)
+
+    def extend_macros(self, macros):
+        self.define_macros.extend(macros)
+
+    def extend_extra_objects(self, objs):
+        self.extra_objects.extend(objs)
+
+
+class cy_build_ext(build_ext):
      def check_ext_symbol_conflicts(self):
          """Checks for symbols defined in multiple extension modules,
          which can lead to crashes due to incorrect functions being invoked.
          Avoid by adding an appropriate #define to import/pysam.h or in
          unusual cases adding another rewrite rule to devtools/import.py.
          """
-        build_ext_obj = self.distribution.get_command_obj('build_ext')
-
          symbols = dict()
          for ext in self.distribution.ext_modules:
-            for sym in run_nm_defined_symbols(build_ext_obj.get_ext_fullpath(ext.name)):
+            for sym in run_nm_defined_symbols(self.get_ext_fullpath(ext.name)):
                  symbols.setdefault(sym, []).append(ext.name.lstrip('pysam.'))
  
          errors = 0
@@ -257,14 +275,55 @@ class extra_build(build):
          if errors > 0: raise LinkError("symbols defined in multiple extensions")
  
      def run(self):
-        build.run(self)
+        if sys.platform == 'darwin':
+            ldshared = os.environ.get('LDSHARED', sysconfig.get_config_var('LDSHARED'))
+            os.environ['LDSHARED'] = ldshared.replace('-bundle', '')
+
+        build_ext.run(self)
          try:
              if HTSLIB_MODE != 'separate':
                  self.check_ext_symbol_conflicts()
          except OSError as e:
-            log.warn("skipping symbol collision check (invoking nm failed: %s)", e)
+            log.warning("skipping symbol collision check (invoking nm failed: %s)", e)
          except subprocess.CalledProcessError:
-            log.warn("skipping symbol collision check (invoking nm failed)")
+            log.warning("skipping symbol collision check (invoking nm failed)")
+
+    def build_extension(self, ext):
+
+        if isinstance(ext, CyExtension) and ext._init_func:
+            ext._init_func(ext)
+
+        if not self.inplace:
+            ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
+
+        if sys.platform == 'darwin':
+            # The idea is to give shared libraries an install name of the form
+            # `@rpath/<library-name.so>`, and to set the rpath equal to
+            # @loader_path. This will allow Python packages to find the library
+            # in the expected place, while still giving enough flexibility to
+            # external applications to link against the library.
+            relative_module_path = ext.name.replace(".", os.sep) + (sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO'))
+            library_path = os.path.join(
+                "@rpath", os.path.basename(relative_module_path)
+            )
+
+            if not ext.extra_link_args:
+                ext.extra_link_args = []
+            ext.extra_link_args += ['-dynamiclib',
+                                    '-rpath', '@loader_path',
+                                    '-Wl,-headerpad_max_install_names',
+                                    '-Wl,-install_name,%s' % library_path,
+                                    '-Wl,-x']
+        else:
+            if not ext.extra_link_args:
+                ext.extra_link_args = []
+
+            ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
+
+        if isinstance(ext, CyExtension) and ext._prebuild_func:
+            ext._prebuild_func(ext, self.force)
+
+        build_ext.build_extension(self, ext)
  
  
  class clean_ext(Command):
@@ -432,7 +491,7 @@ with open(os.path.join("pysam", "config.py"), "w") as outf:
              for line in inf:
                  if line.startswith("#define"):
                      key, value = re.match(
-                        "#define (\S+)\s+(\S+)", line).groups()
+                        r"#define (\S+)\s+(\S+)", line).groups()
                      config_values[key] = value
              for key in ["ENABLE_GCS",
                          "ENABLE_PLUGINS",
@@ -515,7 +574,7 @@ def prebuild_libchtslib(ext, force):
              args = " ".join(ext.extra_compile_args)
              run_make(["ALL_CPPFLAGS=-I. " + args + " $(CPPFLAGS)", "lib-static"])
      else:
-        log.warn("skipping 'libhts.a' (already built)")
+        log.warning("skipping 'libhts.a' (already built)")
  
  
  def prebuild_libcsamtools(ext, force):
@@ -609,8 +668,9 @@ Operating System :: MacOS
  metadata = {
      'name': "pysam",
      'version': get_pysam_version(),
-    'description': "pysam",
+    'description': "Package for reading, manipulating, and writing genomic data",
      'long_description': __doc__,
+    'long_description_content_type': "text/x-rst",
      'author': "Andreas Heger",
      'author_email': "andreas.heger@gmail.com",
      'license': "MIT",
@@ -618,9 +678,8 @@ metadata = {
      'classifiers': [_f for _f in classifiers.split("\n") if _f],
      'url': "https://github.com/pysam-developers/pysam",
      'packages': package_list,
-    'requires': ['cython (>=0.29.12)'],
-    'ext_modules': [Extension(**opts) for opts in modules],
-    'cmdclass': {'build': extra_build, 'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist},
+    'ext_modules': [CyExtension(**opts) for opts in modules],
+    'cmdclass': {'build_ext': cy_build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist},
      'package_dir': package_dirs,
      'package_data': {'': ['*.pxd', '*.h', 'py.typed', '*.pyi'], },
      # do not pack in order to permit linking to csamtools.so
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py

index 1dc72d5f415a973ed8f14bad3d71e4b319fe39a9..855ae47844dce6b26d228d3e7c3a70838c236816 100644 (file)
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -776,6 +776,32 @@ class TestAlignedSegment(ReadTest):
              ],
          )
  
+    def test_get_aligned_pairs_1character_md(self):
+        a = self.build_read()
+        a.query_sequence = "A" * 7
+        a.cigarstring = "7M"
+        a.set_tag("MD", "7", value_type="A")
+        self.assertEqual(
+            a.get_aligned_pairs(with_seq=True),
+            [
+                (0, 20, "A"),
+                (1, 21, "A"),
+                (2, 22, "A"),
+                (3, 23, "A"),
+                (4, 24, "A"),
+                (5, 25, "A"),
+                (6, 26, "A"),
+            ],
+        )
+
+    def test_get_aligned_pairs_bad_type_md(self):
+        a = self.build_read()
+        a.query_sequence = "A" * 7
+        a.cigarstring = "7M"
+        a.set_tag("MD", 7)
+        with self.assertRaises(TypeError):
+            a.get_aligned_pairs(with_seq=True)
+
      def testNoSequence(self):
          """issue 176: retrieving length without query sequence
          with soft-clipping.
diff --git a/tests/_compile_test.pyx b/tests/_compile_test.pyx

index dfe79372c9e7ced14388541736b404644a336400..ea2c6462121f5c29c2eeffb78de14c9d2b8f7250 100644 (file)
--- a/tests/_compile_test.pyx
+++ b/tests/_compile_test.pyx
@@ -1,3 +1,5 @@
+# cython: language_level=3
+
  from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
  from pysam.libctabix cimport Tabixfile
  
diff --git a/tests/_cython_flagstat.pyx b/tests/_cython_flagstat.pyx

index 8e376b017b156916bf7154278da5d45bf8f19c94..39cc15f008c2c6c7893fc055c59cc0a7b9acade4 100644 (file)
--- a/tests/_cython_flagstat.pyx
+++ b/tests/_cython_flagstat.pyx
@@ -1,3 +1,5 @@
+# cython: language_level=3
+
  from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
  from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
  from pysam.libcalignedsegment cimport pysam_get_flag
diff --git a/tox.ini b/tox.ini

deleted file mode 100644 (file)

index 150c5c5..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-# content of: tox.ini , put in same dir as setup.py
-[tox]
-envlist = py36 py311
-
-[testenv]
-deps = pytest       # install pytest in the virtualenv where commands will be executed
-commands =
-    pytest tests
author	Andreas Tille <tille@debian.org>
	Wed, 8 Nov 2023 08:35:50 +0000 (09:35 +0100)
committer	Andreas Tille <tille@debian.org>
	Wed, 8 Nov 2023 08:35:50 +0000 (09:35 +0100)
.cirrus.yml	[new file with mode: 0644]	patch \| blob
.python-version	[deleted file]	patch \| blob \| history
.travis.disabled.yml	[deleted file]	patch \| blob \| history
MANIFEST.in		patch \| blob \| history
NEWS		patch \| blob \| history
README.rst		patch \| blob \| history
bcftools/LICENSE		patch \| blob \| history
bcftools/bcftools.h		patch \| blob \| history
bcftools/cigar_state.h		patch \| blob \| history
bcftools/consensus.c		patch \| blob \| history
bcftools/consensus.c.pysam.c		patch \| blob \| history
bcftools/convert.c		patch \| blob \| history
bcftools/convert.c.pysam.c		patch \| blob \| history
bcftools/convert.h		patch \| blob \| history
bcftools/csq.c		patch \| blob \| history
bcftools/csq.c.pysam.c		patch \| blob \| history
bcftools/filter.c		patch \| blob \| history
bcftools/filter.c.pysam.c		patch \| blob \| history
bcftools/filter.h		patch \| blob \| history
bcftools/gff.c	[new file with mode: 0644]	patch \| blob
bcftools/gff.c.pysam.c	[new file with mode: 0644]	patch \| blob
bcftools/gff.h	[new file with mode: 0644]	patch \| blob
bcftools/hex.h		patch \| blob \| history
bcftools/mpileup.c		patch \| blob \| history
bcftools/mpileup.c.pysam.c		patch \| blob \| history
bcftools/reheader.c		patch \| blob \| history
bcftools/reheader.c.pysam.c		patch \| blob \| history
bcftools/tsv2vcf.c		patch \| blob \| history
bcftools/tsv2vcf.c.pysam.c		patch \| blob \| history
bcftools/variantkey.h		patch \| blob \| history
bcftools/vcfannotate.c		patch \| blob \| history
bcftools/vcfannotate.c.pysam.c		patch \| blob \| history
bcftools/vcfcall.c		patch \| blob \| history
bcftools/vcfcall.c.pysam.c		patch \| blob \| history
bcftools/vcfconcat.c		patch \| blob \| history
bcftools/vcfconcat.c.pysam.c		patch \| blob \| history
bcftools/vcfconvert.c		patch \| blob \| history
bcftools/vcfconvert.c.pysam.c		patch \| blob \| history
bcftools/vcffilter.c		patch \| blob \| history
bcftools/vcffilter.c.pysam.c		patch \| blob \| history
bcftools/vcfgtcheck.c		patch \| blob \| history
bcftools/vcfgtcheck.c.pysam.c		patch \| blob \| history
bcftools/vcfisec.c		patch \| blob \| history
bcftools/vcfisec.c.pysam.c		patch \| blob \| history
bcftools/vcfmerge.c		patch \| blob \| history
bcftools/vcfmerge.c.pysam.c		patch \| blob \| history
bcftools/vcfnorm.c		patch \| blob \| history
bcftools/vcfnorm.c.pysam.c		patch \| blob \| history
bcftools/vcfplugin.c		patch \| blob \| history
bcftools/vcfplugin.c.pysam.c		patch \| blob \| history
bcftools/vcfquery.c		patch \| blob \| history
bcftools/vcfquery.c.pysam.c		patch \| blob \| history
bcftools/vcfsort.c		patch \| blob \| history
bcftools/vcfsort.c.pysam.c		patch \| blob \| history
bcftools/vcfstats.c		patch \| blob \| history
bcftools/vcfstats.c.pysam.c		patch \| blob \| history
bcftools/vcfview.c		patch \| blob \| history
bcftools/vcfview.c.pysam.c		patch \| blob \| history
bcftools/version.c		patch \| blob \| history
bcftools/version.c.pysam.c		patch \| blob \| history
bcftools/version.sh		patch \| blob \| history
cy_build.py	[deleted file]	patch \| blob \| history
devtools/import.py		patch \| blob \| history
devtools/install-prerequisites.sh	[new file with mode: 0755]	patch \| blob
doc/conf.py		patch \| blob \| history
doc/index.rst		patch \| blob \| history
doc/installation.rst		patch \| blob \| history
doc/release.rst		patch \| blob \| history
doc/requirements-rtd.txt	[new file with mode: 0644]	patch \| blob
pyproject.toml		patch \| blob \| history
pysam/libcalignedsegment.pyx		patch \| blob \| history
pysam/libcalignmentfile.pyi		patch \| blob \| history
pysam/libcalignmentfile.pyx		patch \| blob \| history
pysam/libcbcf.pyx		patch \| blob \| history
pysam/libcbgzf.pyx		patch \| blob \| history
pysam/libchtslib.pxd		patch \| blob \| history
pysam/libchtslib.pyi		patch \| blob \| history
pysam/libchtslib.pyx		patch \| blob \| history
pysam/libcutils.pxd		patch \| blob \| history
pysam/libcutils.pyx		patch \| blob \| history
pysam/version.h		patch \| blob \| history
pysam/version.py		patch \| blob \| history
requirements-dev.txt	[new file with mode: 0644]	patch \| blob
requirements.txt	[deleted file]	patch \| blob \| history
samtools/README		patch \| blob \| history
samtools/bam_ampliconclip.c		patch \| blob \| history
samtools/bam_ampliconclip.c.pysam.c		patch \| blob \| history
samtools/bam_consensus.c		patch \| blob \| history
samtools/bam_consensus.c.pysam.c		patch \| blob \| history
samtools/bam_fastq.c		patch \| blob \| history
samtools/bam_fastq.c.pysam.c		patch \| blob \| history
samtools/bam_import.c		patch \| blob \| history
samtools/bam_import.c.pysam.c		patch \| blob \| history
samtools/bam_index.c		patch \| blob \| history
samtools/bam_index.c.pysam.c		patch \| blob \| history
samtools/bam_markdup.c		patch \| blob \| history
samtools/bam_markdup.c.pysam.c		patch \| blob \| history
samtools/bam_md.c		patch \| blob \| history
samtools/bam_md.c.pysam.c		patch \| blob \| history
samtools/bam_reheader.c		patch \| blob \| history
samtools/bam_reheader.c.pysam.c		patch \| blob \| history
samtools/bam_sort.c		patch \| blob \| history
samtools/bam_sort.c.pysam.c		patch \| blob \| history
samtools/bam_split.c		patch \| blob \| history
samtools/bam_split.c.pysam.c		patch \| blob \| history
samtools/bamshuf.c		patch \| blob \| history
samtools/bamshuf.c.pysam.c		patch \| blob \| history
samtools/consensus_pileup.c		patch \| blob \| history
samtools/consensus_pileup.c.pysam.c		patch \| blob \| history
samtools/consensus_pileup.h		patch \| blob \| history
samtools/cram_size.c		patch \| blob \| history
samtools/cram_size.c.pysam.c		patch \| blob \| history
samtools/reset.c		patch \| blob \| history
samtools/reset.c.pysam.c		patch \| blob \| history
samtools/sam_view.c		patch \| blob \| history
samtools/sam_view.c.pysam.c		patch \| blob \| history
samtools/stats.c		patch \| blob \| history
samtools/stats.c.pysam.c		patch \| blob \| history
samtools/version.sh		patch \| blob \| history
setup.py		patch \| blob \| history
tests/AlignedSegment_test.py		patch \| blob \| history
tests/_compile_test.pyx		patch \| blob \| history
tests/_cython_flagstat.pyx		patch \| blob \| history
tox.ini	[deleted file]	patch \| blob \| history