From: Andreas Tille Date: Wed, 8 Nov 2023 08:35:50 +0000 (+0100) Subject: New upstream version 0.22.0+ds X-Git-Tag: archive/raspbian/0.23.0+ds-1+rpi1~1^2~14^2~1 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=9fc6ed26ac4ae101a9bffb2bd9b73f0a396a4693;p=python-pysam.git New upstream version 0.22.0+ds --- diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 0000000..edf235c --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,73 @@ +build_wheels_task: + only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*" + + matrix: + - compute_engine_instance: + image_project: cirrus-images + image: family/docker-builder-arm64 + architecture: arm64 + platform: linux + matrix: + - name: Build ARM Linux py3.6-9 wheels + env: + CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*" + - name: Build ARM Linux py3.10-12 wheels + env: + CIBW_BUILD: "cp310-* cp311-* cp312-*" + + - name: Build ARM macOS wheels + macos_instance: + image: ghcr.io/cirruslabs/macos-ventura-base:latest + env: + CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + + alias: build_wheels + + env: + CIRRUS_CLONE_DEPTH: 1 + + CIBW_SKIP: "*-musllinux_*" + CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 + + install_script: | + python3 -m pip install cibuildwheel==2.16.2 + + build_script: | + cibuildwheel + + wheels_artifacts: + path: wheelhouse/*.whl + +upload_pypi_task: + only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*" + depends_on: build_wheels + + name: Publish ARM wheels + + container: + image: python:latest + + env: + CIRRUS_CLONE_DEPTH: 1 + API_BASEURL: https://api.cirrus-ci.com/v1 + TWINE_USERNAME: __token__ + + install_script: | + python3 -m pip install twine + + get_artifacts_script: | + curl -sSLO $API_BASEURL/artifact/build/$CIRRUS_BUILD_ID/wheels.zip + unzip -q wheels.zip + + upload_script: | + case "$CIRRUS_TAG" in + v0.*) + export TWINE_REPOSITORY=pypi TWINE_PASSWORD=$PYPI_TOKEN ;; + *) + export TWINE_REPOSITORY=testpypi TWINE_PASSWORD=$TESTPYPI_TOKEN ;; + esac + + echo Uploading 
wheels to $TWINE_REPOSITORY... + + python3 -m twine check wheelhouse/*.whl + python3 -m twine upload --disable-progress-bar wheelhouse/*.whl diff --git a/.python-version b/.python-version deleted file mode 100644 index d8c6f97..0000000 --- a/.python-version +++ /dev/null @@ -1,2 +0,0 @@ -3.6 -3.11 diff --git a/.travis.disabled.yml b/.travis.disabled.yml deleted file mode 100644 index 5b7bcc8..0000000 --- a/.travis.disabled.yml +++ /dev/null @@ -1,114 +0,0 @@ -os: - - linux - - osx - -language: c - -stages: - - test - - name: deploy - if: tag IS present - -env: - matrix: - - CONDA_PY=2.7 - - CONDA_PY=3.6 - - CONDA_PY=3.7 - - CONDA_PY=3.8 - global: - - PYSAM_LINKING_TEST=1 - - TWINE_USERNAME=grepall - - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU= - -_cibw_common: &cibw_common - addons: {} - install: - - python3 -m pip install cibuildwheel>=1.1.0 twine - script: - - set -e - - cibuildwheel --output-dir dist - - twine check dist/* - - twine upload --skip-existing dist/* - -_cibw_linux: &cibw_linux - stage: deploy - os: linux - language: python - python: '3.5' - services: - - docker - <<: *cibw_common - -_cibw_linux_aarch64: &cibw_linux_aarch64 - stage: deploy - os: linux - arch: arm64 - language: python - python: '3.9' - services: - - docker - <<: *cibw_common - -matrix: - include: - - stage: deploy - os: linux - language: python - python: '3.5' - addons: - apt: - packages: - - gcc - - g++ - - libcurl4-openssl-dev # for libcurl support in sdist - - libssl-dev # for s3 support in sdist - install: - - python3 -m pip install Cython twine - script: - - set -e - - python3 setup.py build_ext --inplace - - python3 setup.py sdist - - twine check dist/* - - twine upload --skip-existing dist/* - - <<: *cibw_linux - env: - - CIBW_BUILD="*_x86_64" - - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r 
requirements.txt" - - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' - - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}' - - CIBW_TEST_COMMAND='python -c "import pysam"' - - <<: *cibw_linux - env: - - CIBW_BUILD="*_i686" - - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt" - - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' - - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}' - - CIBW_TEST_COMMAND='python -c "import pysam"' - - <<: *cibw_linux_aarch64 - env: - - CIBW_BUILD="*_aarch64" - - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt" - - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' - - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}' - - CIBW_TEST_COMMAND='python -c "import pysam"' - - stage: deploy - os: osx - language: generic - env: - - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt" - - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' - - CIBW_TEST_COMMAND='python -c "import pysam"' - <<: *cibw_common - -addons: - apt: - packages: - - gcc - - g++ - -script: - - ./devtools/run_tests_travis.sh - -notifications: - email: - - andreas.heger@gmail.com diff --git a/MANIFEST.in b/MANIFEST.in index 25e9a1a..5711f09 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,8 +9,7 @@ include NEWS include INSTALL include KNOWN_BUGS include THANKS -include cy_build.py -include requirements.txt +include requirements-dev.txt include pysam/libc*.pxd include pysam/libc*.pyx include pysam/libc*.c @@ -46,9 +45,6 @@ include htslib/configure htslib/version.sh include htslib/Makefile htslib/*.mk exclude htslib/config.mk htslib/htscodecs.mk -include cy_build.py -include requirements.txt - # documentation include doc/*.py doc/*.rst -include doc/Makefile doc/make.bat +include doc/Makefile 
doc/make.bat doc/requirements-rtd.txt diff --git a/NEWS b/NEWS index e0b77a9..ad7cfb1 100644 --- a/NEWS +++ b/NEWS @@ -1,17 +1,96 @@ -An online version of the installation instructions can be found here: -http://pysam.readthedocs.io/en/latest/release.html +.. An online version of the release history can be found here: +.. http://pysam.readthedocs.io/en/latest/release.html + +Release 0.22.0 +============== + +.. rubric:: 5 October 2023 + +This pysam release wraps htslib/samtools/bcftools 1.18 (PR #1208). + +It has been tested with Python versions 3.6 through 3.12, and wheels are +available via pypi_ for all of those Python versions. Python versions 3.6 +and 3.7 are end-of-life; particularly if you use pysam with either of +these versions, please vote in the version survey at issue #1230. + +The final pysam release that supported Python 2.7 was v0.20.0. + +Bugs fixed: + +* Remove Cython from runtime dependencies (PR #1186, thanks to Nicola Soranzo, + also reported by Arya Massarat in PR #1194) + +* Miscellaneous dependency improvements (PR #1216, #1217, PR #1218, PR #1219, + thanks to Martin Larralde and Arthur Vigil) + +* Suppress spurious "Could not retrieve index file" message when opening an + AlignmentFile (#939, #1214, reported by ChengYong Tham and Sebastian Röner) + +* Propagate SAM parsing errors encountered in :meth:`.AlignedSegment.fromstring` + (#1196, reported by DV Klopfenstein) + +* Accept invalid MD:A tagged fields produced by HTSeq instead of crashing + in :meth:`AlignedSegment.get_aligned_pairs(with_seq=True) + <.AlignedSegment.get_aligned_pairs>` (#1226, reported by Isaac Vock) + +* Fix multiarch macOS CI builds by removing brewed liblzma (#1205, reported + by Till Hartmann) + +* Fix :attr:`.VariantRecordSample.alleles` type hint (#1179, reported by + David Seifert) + +New functionality: + +* Add optional :meth:`HTSFile.seek(..., whence) <.HTSFile.seek>` parameter + and clarify which functions use libc.SEEK_SET vs io.SEEK_SET + (#1185, requested by 
luyulin) + +* File handling improvements in samtools & bcftools commands (should improve + #1193 and #1195, reported by Rob Bierman and Sam Chorlton) + +* Improve :class:`.FastxFile` performance (PR #1227, thanks to Fabian Klötzl + and Valentyn Bezshapkin) + +* Improve the accuracy of type hints for :class:`.AlignmentFile` iteration + (#1184, PR #1189, reported by @PikalaxALT) + +Documentation improvements: + +* Clarify that :meth:`.AlignedSegment.get_aligned_pairs` results are 0-based + (#1180, reported by Nick Semenkovich) + +* Clarify :meth:`.AlignedSegment.get_reference_positions` documentation + (#836, #838, reported by Liang Ou and Nick Stoler) + +* Clarify that installation via pip usually uses a wheel, and that configuring + the build via $HTSLIB_CONFIGURE_OPTIONS etc only applies when installing from + an sdist (#1086, reported by Layne Sadler) + +A message from pysam's founder, Andreas Heger: + + As many of you will have noticed, John Marshall has been effectively + maintaining pysam and supporting users over the last few years. + I, Andreas, am very grateful for the countless hours he has contributed. + Unfortunately, I will not be able to contribute much in the near and + intermediate future. To keep pysam going, John has kindly agreed to + continue maintaining and supporting pysam as the principal developer + of pysam. I am very happy to know that pysam is in good hands and want + to thank again John and the wider pysam community for their suggestions, + bug reports, code contributions and general support. + +Thank you Andreas for all your work over the years and the solid foundations +that pysam enjoys and the useful functionality it provides. -============= -Release notes -============= Release 0.21.0 ============== +.. rubric:: 2 April 2023 + This release wraps htslib/samtools/bcftools version 1.17. -Pysam is now compatible with Python 3.11. We have removed python 2.x -support. Pysam is tested with python versions 3.6 to 3.11. 
+Pysam is now compatible with Python 3.11. We have removed Python 2.x +support. Pysam is tested with Python versions 3.6 to 3.11. * [#1175] VariantHeader.new_record: set start/stop before alleles * [#1173] Add multiple build improvements in htscodecs on multi-arch macOS @@ -27,9 +106,12 @@ support. Pysam is tested with python versions 3.6 to 3.11. * [#1149] MacOS universal build compatibility. * [#1146] Fix build when CFLAGS/etc environment variables are set. + Release 0.20.0 ============== +.. rubric:: 29 October 2022 + This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1. * [#1113] Full compatibility with setuptools v62.1.0's build directory name changes @@ -40,17 +122,23 @@ This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1. Many additional type hints have been provided by the community, thanks! + Release 0.19.1 ============== +.. rubric:: 27 May 2022 + This release wraps htslib/samtools/bcftools version 1.15.1. * [#1104] add an add_samples() method to quickly add multiple samples to VCF. + Release 0.19.0 ============== +.. rubric:: 30 March 2022 + This release wraps htslib/samtools/bcftools version 1.15. * [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands @@ -66,10 +154,13 @@ This release wraps htslib/samtools/bcftools version 1.15. * Fix BGZFile.read() behaviour near or at EOF * First API for the htslib modified bases interface - + + Release 0.18.0 ============== +.. rubric:: 17 November 2021 + This release wraps htslib/samtools/bcftools version 1.14. * [#1048] and [#1060], clarify documentation of index statistics with CRAM files @@ -77,9 +168,12 @@ This release wraps htslib/samtools/bcftools version 1.14. * Add new "samples" subcommand to pysam/samtools.py * Introduce TupleProxyIterator iterator object class + Release 0.17.0 ============== +.. rubric:: 30 September 2021 + This release wraps htslib/samtools/bcftools version 1.13. 
Corresponding to new samtools commands, `pysam.samtools` now has additional functions `ampliconclip`, `ampliconstats`, `fqimport`, and `version`. @@ -122,6 +216,8 @@ Documentation improvements: Release 0.16.0 ============== +.. rubric:: 8 June 2020 + This release wraps htslib/bcftools version 1.10.2 and samtools version 1.10. The following bugs reported against pysam are fixed due to this: @@ -162,6 +258,7 @@ version in order to fix pip install pysam with python 3.8. * [#846] Prevent segmentation fault on ID, when handling malformed records * [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars + Release 0.15.3 ============== @@ -205,7 +302,7 @@ Bugfix release. Release 0.15.0 ============== -This release wraps htslib (and friends) version 1.9. +This release wraps htslib/samtools/bcftools version 1.9. * [#673] permit dash in chromosome name of region string * [#656] Support `text` when opening a SAM file for writing @@ -225,6 +322,7 @@ upgraded to 1.7.0. * treat border case of all bases in pileup column below quality score * [#634] Fix access to pileup reference_sequence + Release 0.14.0 ============== @@ -289,6 +387,7 @@ contains a series of bugfixes. * [#537] allow tabix index files to be created in a custom location. * [#530] add get_index_statistics() method + Release 0.12.0.1 ================ @@ -304,6 +403,7 @@ contains a series of bugfixes. * [#473] A new FastxRecord class that can be instantiated from class and modified in-place. Replaces PersistentFastqProxy. * [#521] In AligmentFile, Simplify file detection logic and allow remote index files + * Removed attempts to guess data and index file names; this is magic left to htslib. * Removed file existence check prior to opening files with htslib @@ -314,6 +414,7 @@ contains a series of bugfixes. * Allow remote indices (tested using S3 signed URLs). * Document filepath_index and make it an alias for index_filename. 
* Added a require_index parameter to AlignmentFile + * [#526] handle unset ref when creating new records * [#513] fix bcf_translate to skip deleted FORMAT fields to avoid segfaults @@ -554,12 +655,14 @@ Potential isses when upgrading from v0.8.3: * renamed several methods for pep8 compatibility, old names still retained for backwards compatibility, but should be considered deprecated. + * gettid() is now get_tid() * getrname() is now get_reference_name() * parseRegion() is now parse_region() * some methods have changed for pep8 compatibility without the old names being present: + * fromQualityString() is now qualitystring_to_array() * toQualityString() is now qualities_to_qualitystring() @@ -678,6 +781,7 @@ Release 0.8.1 * Pysam now wraps htslib and samtools versions 1.1. * Bugfixes, most notable: + * issue #43: uncompressed BAM output * issue #42: skip tests requiring network if none available * issue #19: multiple iterators can now be made to work on the same tabix file diff --git a/README.rst b/README.rst index 4f19003..b50e2e5 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.17, samtools-1.17, and bcftools-1.17. +The current version of pysam wraps 3rd-party code from htslib-1.18, samtools-1.18, and bcftools-1.18. Pysam is available through `pypi `_. To install, type:: @@ -42,10 +42,10 @@ Questions and comments are very welcome and should be sent to the .. _tabix: http://samtools.sourceforge.net/tabix.shtml .. _Li 2009: http://www.ncbi.nlm.nih.gov/pubmed/19505943 -.. |build-status| image:: https://travis-ci.org/pysam-developers/pysam.svg +.. 
|build-status| image:: https://github.com/pysam-developers/pysam/actions/workflows/ci.yaml/badge.svg :alt: build status :scale: 100% - :target: https://travis-ci.org/pysam-developers/pysam + :target: https://github.com/pysam-developers/pysam/actions/workflows/ci.yaml .. |docs| image:: https://readthedocs.org/projects/pysam/badge/?version=latest :alt: Documentation Status diff --git a/bcftools/LICENSE b/bcftools/LICENSE index 6d40ae2..46dc0e0 100644 --- a/bcftools/LICENSE +++ b/bcftools/LICENSE @@ -723,11 +723,12 @@ Public License instead of this License. But first, please read ----------------------------------------------------------------------------- -LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey) +LICENSE FOR VariantKey (https://github.com/tecnickcom/variantkey) The MIT License Copyright (c) 2017-2018 GENOMICS plc +Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index c3f7ded..bba71e3 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2 // newline will be added by the function. void error_errno(const char *format, ...) 
HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); +// For on the fly index creation with --write-index +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname); + void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); const char *hts_bcf_wmode2(int file_type, const char *fname); diff --git a/bcftools/cigar_state.h b/bcftools/cigar_state.h index a12a709..dacac14 100644 --- a/bcftools/cigar_state.h +++ b/bcftools/cigar_state.h @@ -107,6 +107,12 @@ static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int tri cs->icig++; continue; } + if ( op==BAM_CHARD_CLIP || op==BAM_CPAD ) + { + cs->icig++; + continue; + } + error("FIXME: not ready for CIGAR operator %d\n",op); } // the read starts after pos if ( trim_left ) @@ -175,6 +181,12 @@ static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_ cs->icig++; continue; } + if ( op==BAM_CHARD_CLIP || op==BAM_CPAD ) + { + cs->icig++; + continue; + } + error("FIXME: not ready for CIGAR operator %d\n",op); } return cs->icig < cs->ncig ? 
-1 : -2; } diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 397d45f..2b58670 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -54,8 +54,8 @@ #define PICK_SHORT 8 #define PICK_IUPAC 16 -#define TO_UPPER 0 -#define TO_LOWER 1 +#define TO_UPPER 1 +#define TO_LOWER 2 typedef struct { @@ -324,7 +324,7 @@ static void init_region(args_t *args, char *line) { char *ss, *se = line; while ( *se && !isspace(*se) && *se!=':' ) se++; - int from = 0, to = 0; + hts_pos_t from = 0, to = 0; char tmp = 0, *tmp_ptr = NULL; if ( *se ) { @@ -356,7 +356,14 @@ static void init_region(args_t *args, char *line) args->fa_frz_mod = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; - bcf_sr_seek(args->files,line,args->fa_ori_pos); + + kstring_t str = {0,0,0}; + if ( from==0 ) from = 1; + if ( to==0 ) to = HTS_POS_MAX; + ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to); + bcf_sr_set_regions(args->files,line,0); + free(str.s); + if ( tmp_ptr ) *tmp_ptr = tmp; fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); if ( args->chain_fname ) @@ -466,25 +473,37 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark) static void mark_ins(char *ref, char *alt, char mark) { int i, nref = strlen(ref), nalt = strlen(alt); - if ( mark=='l' ) + if ( mark==TO_LOWER ) for (i=nref; imark_del = optarg[0]; break; case 2 : - if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u'; - else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l'; + if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER; + else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER; + else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0]; else error("The argument is not recognised: --mark-ins %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u'; - else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l'; + if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER; + else if ( !strcasecmp(optarg,"lc") ) 
args->mark_snv = TO_LOWER; + else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0]; else error("The argument is not recognised: --mark-snv %s\n",optarg); break; case 'p': args->chr_prefix = optarg; break; @@ -1211,7 +1231,8 @@ int main_consensus(int argc, char *argv[]) { char *tmp; args->haplotype = strtol(optarg, &tmp, 10); - if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); + if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg); + if ( *tmp ) args->allele |= PICK_IUPAC; if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); } break; diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index b611925..9f0826b 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -56,8 +56,8 @@ #define PICK_SHORT 8 #define PICK_IUPAC 16 -#define TO_UPPER 0 -#define TO_LOWER 1 +#define TO_UPPER 1 +#define TO_LOWER 2 typedef struct { @@ -326,7 +326,7 @@ static void init_region(args_t *args, char *line) { char *ss, *se = line; while ( *se && !isspace(*se) && *se!=':' ) se++; - int from = 0, to = 0; + hts_pos_t from = 0, to = 0; char tmp = 0, *tmp_ptr = NULL; if ( *se ) { @@ -358,7 +358,14 @@ static void init_region(args_t *args, char *line) args->fa_frz_mod = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; - bcf_sr_seek(args->files,line,args->fa_ori_pos); + + kstring_t str = {0,0,0}; + if ( from==0 ) from = 1; + if ( to==0 ) to = HTS_POS_MAX; + ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to); + bcf_sr_set_regions(args->files,line,0); + free(str.s); + if ( tmp_ptr ) *tmp_ptr = tmp; fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); if ( args->chain_fname ) @@ -468,25 +475,37 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark) static void mark_ins(char *ref, char 
*alt, char mark) { int i, nref = strlen(ref), nalt = strlen(alt); - if ( mark=='l' ) + if ( mark==TO_LOWER ) for (i=nref; imark_del = optarg[0]; break; case 2 : - if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u'; - else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l'; + if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER; + else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER; + else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0]; else error("The argument is not recognised: --mark-ins %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u'; - else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l'; + if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER; + else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER; + else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0]; else error("The argument is not recognised: --mark-snv %s\n",optarg); break; case 'p': args->chr_prefix = optarg; break; @@ -1213,7 +1233,8 @@ int main_consensus(int argc, char *argv[]) { char *tmp; args->haplotype = strtol(optarg, &tmp, 10); - if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); + if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg); + if ( *tmp ) args->allele |= PICK_IUPAC; if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); } break; diff --git a/bcftools/convert.c b/bcftools/convert.c index 80e5474..07ff018 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -106,6 +106,7 @@ struct _convert_t char **used_tags_list; int nused_tags; int allow_undef_tags; + int force_newline; uint8_t **subset_samples; }; @@ -648,6 +649,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp static void process_line(convert_t *convert, 
bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { vcf_format1(convert->header, line, str); + if ( str->s[str->l-1]=='\n' ) str->l--; } static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -1560,7 +1562,6 @@ void convert_destroy(convert_t *convert) int convert_header(convert_t *convert, kstring_t *str) { int i, icol = 0, l_ori = str->l; - bcf_hdr_t *hdr = convert->header; // Supress the header output if LINE is present for (i=0; infmt; i++) @@ -1568,6 +1569,12 @@ int convert_header(convert_t *convert, kstring_t *str) if ( i!=convert->nfmt ) return str->l - l_ori; + // Header formatting becomes problematic when the formatting expression contains a newline. + // Simple cases like + // -f'[%CHROM %POS %SAMPLE\n]' + // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines + // are present. + int has_fmt_newline = 0; kputc('#', str); for (i=0; infmt; i++) { @@ -1578,18 +1585,25 @@ int convert_header(convert_t *convert, kstring_t *str) while ( convert->fmt[j].is_gt_field ) j++; for (js=0; jsnsamples; js++) { - int ks = convert->samples[js]; for (k=i; kfmt[k].type == T_SEP ) { - if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str); + if ( convert->fmt[k].key ) + { + char *tmp = convert->fmt[k].key; + while ( *tmp ) + { + if ( *tmp=='\n' ) has_fmt_newline = 1; + else kputc(*tmp,str); + tmp++; + } + } } - else if ( convert->fmt[k].type == T_SAMPLE ) - ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); else - ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key); + ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); } + if ( has_fmt_newline ) break; } i = j-1; continue; @@ -1602,6 +1616,7 @@ int convert_header(convert_t *convert, kstring_t *str) } ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key); } + if ( has_fmt_newline ) kputc('\n',str); return str->l - l_ori; } @@ -1678,6 +1693,47 @@ int convert_line(convert_t *convert, bcf1_t 
*line, kstring_t *str) return str->l - l_ori; } +static void force_newline_(convert_t *convert) +{ + int i, has_newline = 0; + for (i=0; infmt; i++) + { + if ( !convert->fmt[i].key ) continue; + char *tmp = convert->fmt[i].key; + while (*tmp) + { + if ( *tmp=='\n' ) { has_newline = 1; break; } + tmp++; + } + if ( has_newline ) break; + } + if ( has_newline ) return; + + // A newline is not present, force it. But where to add it? + // Consider + // -f'%CHROM[ %SAMPLE]\n' + // vs + // -f'[%CHROM %SAMPLE\n]' + for (i=0; infmt; i++) + if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break; + + if ( i < convert->nfmt ) + register_tag(convert, "\n", 0, T_SEP); // the first case + else + { + // the second case + i = convert->nfmt - 1; + if ( !convert->fmt[i].key ) + { + convert->fmt[i].key = strdup("\n"); + convert->fmt[i].is_gt_field = 1; + register_tag(convert, NULL, 0, T_SEP); + } + else + register_tag(convert, "\n", 1, T_SEP); + } +} + int convert_set_option(convert_t *convert, enum convert_option opt, ...) { int ret = 0; @@ -1692,6 +1748,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) 
case subset_samples: convert->subset_samples = va_arg(args, uint8_t**); break; + case force_newline: + convert->force_newline = va_arg(args, int); + if ( convert->force_newline ) force_newline_(convert); + break; default: ret = -1; } diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index 92f9d01..09a7648 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -108,6 +108,7 @@ struct _convert_t char **used_tags_list; int nused_tags; int allow_undef_tags; + int force_newline; uint8_t **subset_samples; }; @@ -650,6 +651,7 @@ static void process_type(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { vcf_format1(convert->header, line, str); + if ( str->s[str->l-1]=='\n' ) str->l--; } static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { @@ -1562,7 +1564,6 @@ void convert_destroy(convert_t *convert) int convert_header(convert_t *convert, kstring_t *str) { int i, icol = 0, l_ori = str->l; - bcf_hdr_t *hdr = convert->header; // Supress the header output if LINE is present for (i=0; infmt; i++) @@ -1570,6 +1571,12 @@ int convert_header(convert_t *convert, kstring_t *str) if ( i!=convert->nfmt ) return str->l - l_ori; + // Header formatting becomes problematic when the formatting expression contains a newline. + // Simple cases like + // -f'[%CHROM %POS %SAMPLE\n]' + // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines + // are present. 
+ int has_fmt_newline = 0; kputc('#', str); for (i=0; infmt; i++) { @@ -1580,18 +1587,25 @@ int convert_header(convert_t *convert, kstring_t *str) while ( convert->fmt[j].is_gt_field ) j++; for (js=0; jsnsamples; js++) { - int ks = convert->samples[js]; for (k=i; kfmt[k].type == T_SEP ) { - if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str); + if ( convert->fmt[k].key ) + { + char *tmp = convert->fmt[k].key; + while ( *tmp ) + { + if ( *tmp=='\n' ) has_fmt_newline = 1; + else kputc(*tmp,str); + tmp++; + } + } } - else if ( convert->fmt[k].type == T_SAMPLE ) - ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); else - ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key); + ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key); } + if ( has_fmt_newline ) break; } i = j-1; continue; @@ -1604,6 +1618,7 @@ int convert_header(convert_t *convert, kstring_t *str) } ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key); } + if ( has_fmt_newline ) kputc('\n',str); return str->l - l_ori; } @@ -1680,6 +1695,47 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) return str->l - l_ori; } +static void force_newline_(convert_t *convert) +{ + int i, has_newline = 0; + for (i=0; infmt; i++) + { + if ( !convert->fmt[i].key ) continue; + char *tmp = convert->fmt[i].key; + while (*tmp) + { + if ( *tmp=='\n' ) { has_newline = 1; break; } + tmp++; + } + if ( has_newline ) break; + } + if ( has_newline ) return; + + // A newline is not present, force it. But where to add it? 
+ // Consider + // -f'%CHROM[ %SAMPLE]\n' + // vs + // -f'[%CHROM %SAMPLE\n]' + for (i=0; infmt; i++) + if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break; + + if ( i < convert->nfmt ) + register_tag(convert, "\n", 0, T_SEP); // the first case + else + { + // the second case + i = convert->nfmt - 1; + if ( !convert->fmt[i].key ) + { + convert->fmt[i].key = strdup("\n"); + convert->fmt[i].is_gt_field = 1; + register_tag(convert, NULL, 0, T_SEP); + } + else + register_tag(convert, "\n", 1, T_SEP); + } +} + int convert_set_option(convert_t *convert, enum convert_option opt, ...) { int ret = 0; @@ -1694,6 +1750,10 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) case subset_samples: convert->subset_samples = va_arg(args, uint8_t**); break; + case force_newline: + convert->force_newline = va_arg(args, int); + if ( convert->force_newline ) force_newline_(convert); + break; default: ret = -1; } diff --git a/bcftools/convert.h b/bcftools/convert.h index 5bbbc2c..0626070 100644 --- a/bcftools/convert.h +++ b/bcftools/convert.h @@ -1,6 +1,6 @@ /* convert.h -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2023 Genome Research Ltd. Author: Petr Danecek @@ -32,6 +32,7 @@ enum convert_option { allow_undef_tags, subset_samples, + force_newline, }; convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str); diff --git a/bcftools/csq.c b/bcftools/csq.c index 49812d4..f619e06 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -35,7 +35,7 @@ Read about transcript types here http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html http://www.ensembl.org/info/genome/variation/predicted_data.html - http://www.gencodegenes.org/gencode_biotypes.html + https://www.gencodegenes.org/pages/biotypes.html List of supported biotypes antisense @@ -45,6 +45,7 @@ IG_LV_gene IG_V_gene lincRNA + lncRNA .. 
generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping macro_lncRNA miRNA misc_RNA @@ -52,7 +53,7 @@ Mt_tRNA polymorphic_pseudogene processed_transcript - protein_coding + protein_coding, mRNA ribozyme rRNA sRNA @@ -144,6 +145,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +155,7 @@ #include "kheap.h" #include "smpl_ilist.h" #include "rbuf.h" +#include "gff.h" #ifndef __FUNCTION__ # define __FUNCTION__ __func__ @@ -162,20 +165,8 @@ #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 -// Definition of splice_region, splice_acceptor and splice_donor -#define N_SPLICE_DONOR 2 -#define N_SPLICE_REGION_EXON 3 -#define N_SPLICE_REGION_INTRON 8 - #define N_REF_PAD 10 // number of bases to avoid boundary effects -#define STRAND_REV 0 -#define STRAND_FWD 1 - -#define TRIM_NONE 0 -#define TRIM_5PRIME 1 -#define TRIM_3PRIME 2 - // How to treat phased/unphased genotypes #define PHASE_REQUIRE 0 // --phase r #define PHASE_MERGE 1 // --phase m @@ -223,6 +214,7 @@ #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) +#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING)) #define CSQ_PRN_BIOTYPE CSQ_NON_CODING // see kput_vcsq() @@ -254,119 +246,6 @@ const char *csq_strings[] = "start_retained" }; - -// GFF line types -#define GFF_UNKN_LINE 0 -#define GFF_TSCRIPT_LINE 1 -#define GFF_GENE_LINE 2 - - -/* - Genomic features, for fast lookup by position to overlapping features -*/ -#define GF_coding_bit 6 -#define GF_is_coding(x) ((x) & (1<aux) +typedef struct { - uint32_t id; // transcript id - uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive) - uint32_t strand:1, // STRAND_REV or STRAND_FWD - ncds:31, // number of exons - mcds; - gf_cds_t **cds; // ordered list of exons char *ref; // 
reference sequence, padded with N_REF_PAD bases on both ends char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends hap_node_t *root; // root of the haplotype tree hap_node_t **hap; // pointer to haplotype leaves, two for each sample int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD - uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types - type:30; // one of GF_* types - gf_gene_t *gene; -}; -static inline int cmp_tscript(tscript_t **a, tscript_t **b) +} +tscript_t; +static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b) { return ( (*a)->end < (*b)->end ) ? 1 : 0; } -KHEAP_INIT(trhp, tscript_t*, cmp_tscript) +KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript) typedef khp_trhp_t tr_heap_t; typedef struct { @@ -494,7 +366,7 @@ typedef struct { int mstack; hstack_t *stack; - tscript_t *tr; // tr->ref: spliced transcript on ref strand + gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand kstring_t sseq; // spliced haplotype sequence on ref strand kstring_t tseq; // the variable part of translated haplotype transcript, coding strand kstring_t tref; // the variable part of translated reference transcript, coding strand @@ -503,77 +375,20 @@ typedef struct } hap_t; - -/* - Helper structures, only for initialization - - ftr_t - temporary list of all exons, CDS, UTRs -*/ -KHASH_MAP_INIT_INT(int2tscript, tscript_t*) -KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) -typedef struct -{ - int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR - uint32_t beg; - uint32_t end; - uint32_t trid; - uint32_t strand:1; // STRAND_REV,STRAND_FWD - uint32_t phase:2; // 0, 1, 2, or 3 for unknown - uint32_t iseq:29; -} -ftr_t; -/* - Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) - to integer id. To keep the memory requirements low, the original version - relied on IDs in the form of a string prefix and a numerical id. 
However, - it turns out that this assumption is not valid for some ensembl GFFs, see - for example Zea_mays.AGPv4.36.gff3.gz - */ -typedef struct -{ - void *str2id; // khash_str2int - int nstr, mstr; - char **str; // numeric id to string -} -id_tbl_t; -typedef struct -{ - // all exons, CDS, UTRs - ftr_t *ftr; - int nftr, mftr; - - // mapping from gene id to gf_gene_t - kh_int2gene_t *gid2gene; - - // mapping from transcript id to tscript, for quick CDS anchoring - kh_int2tscript_t *id2tr; - - // sequences - void *seq2int; // str2int hash - char **seq; - int nseq, mseq; - - // ignored biotypes - void *ignored_biotypes; - - id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx -} -aux_t; - typedef struct _args_t { // the main regidx lookups, from chr:beg-end to overlapping features and // index iterator + gff_t *gff; regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; regitr_t *itr; - // temporary structures, deleted after initializtion - aux_t init; - // text tab-delimited output (out) or vcf/bcf output (out_fh) FILE *out; htsFile *out_fh; + char *index_fn; + int write_index; + char *dump_gff; // vcf bcf_srs_t *sr; @@ -597,6 +412,13 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; + int unify_chr_names; + char *chr_name; + int mchr_name; + struct { + int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + } warned; int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing @@ -604,11 +426,10 @@ typedef struct _args_t vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position 
- tscript_t **rm_tr; // buffer of transcripts to clean + gf_tscript_t **rm_tr; // buffer of transcripts to clean int nrm_tr, mrm_tr; csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs int ncsq_buf, mcsq_buf; - id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx int force; // force run under various conditions. Currently only to skip out-of-phase transcripts int n_threads; // extra compression/decompression threads @@ -645,818 +466,6 @@ const uint8_t cnt4[] = #define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] -static const char *gf_strings_noncoding[] = -{ - "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", - "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", - "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", - "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", - "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", - "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", - "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", - "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" -}; -static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; -static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; - -const char 
*gf_type2gff_string(int type) -{ - if ( !GF_is_coding(type) ) - { - if ( type < (1<init; - char c = chr_end[1]; - chr_end[1] = 0; - int iseq; - if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) - { - // check for possible mismatch in chromosome naming convention such as chrX vs X - char *new_chr = NULL; - if ( faidx_has_seq(args->fai,chr_beg) ) - new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx - else - { - int len = strlen(chr_beg); - if ( !strncmp("chr",chr_beg,3) && len>3 ) - new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not - else - { - new_chr = malloc(len+4); // gff does not have the prefix, faidx has - memcpy(new_chr,"chr",3); - memcpy(new_chr+3,chr_beg,len); - new_chr[len+3] = 0; - } - if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai - { - static int unkwn_chr_warned = 0; - if ( !unkwn_chr_warned && args->verbosity>0 ) - fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg); - unkwn_chr_warned = 1; - free(new_chr); - new_chr = strdup(chr_beg); // use the original sequence name - } - } - if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 ) - { - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = new_chr; - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq - } - else - free(new_chr); - } - chr_end[1] = c; - return iseq; -} -static inline char *gff_skip(const char *line, char *ss) -{ - while ( *ss && *ss!='\t' ) ss++; - if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - return ss+1; -} -static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end) -{ - char *se = (char*) line; - while ( *se && *se!='\t' ) se++; - if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - *chr_beg = (char*) 
line; - *chr_end = se-1; -} -static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) -{ - char *se = ss; - *beg = strtol(ss, &se, 10) - 1; - if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); - ss = se+1; - *end = strtol(ss, &se, 10) - 1; - if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - return se+1; -} -static void gff_id_init(id_tbl_t *tbl) -{ - memset(tbl, 0, sizeof(*tbl)); - tbl->str2id = khash_str2int_init(); -} -static void gff_id_destroy(id_tbl_t *tbl) -{ - khash_str2int_destroy_free(tbl->str2id); - free(tbl->str); -} -// returns 0 on success, -1 on failure -static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr) -{ - ss = strstr(ss,needle); // e.g. "ID=transcript:" - if ( !ss ) return -1; - ss += strlen(needle); - - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - char tmp = *se; - *se = 0; - - int id; - if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 ) - { - id = tbl->nstr++; - hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); - tbl->str[id] = strdup(ss); - khash_str2int_set(tbl->str2id, tbl->str[id], id); - } - *se = tmp; - *id_ptr = id; - return 0; -} -static inline int gff_parse_type(char *line) -{ - line = strstr(line,"ID="); - if ( !line ) return -1; - line += 3; - if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE; - else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE; - return -1; -} -static inline int gff_parse_biotype(char *_line) -{ - char *line = strstr(_line,"biotype="); - if ( !line ) return -1; - - line += 8; - switch (*line) - { - case 'p': - if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; - else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; - else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; - else if ( 
!strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; - break; - case 'a': - if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT; - else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; - else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; - break; - case 'I': - if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C; - else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D; - else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J; - else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV; - else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V; - else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; - else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; - else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; - else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; - break; - case 'T': - if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C; - else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D; - else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J; - else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V; - else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; - else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; - break; - case 'M': - if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; - else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; - else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; - break; - case 'l': - if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; - break; - case 'm': - if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; - else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; - else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; - else if ( !strncmp(line,"miRNA",5) ) return 
GF_miRNA; - else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; - break; - case 'r': - if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; - else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; - else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; - else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; - break; - case 's': - if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; - else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; - else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; - else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; - else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; - else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; - else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; - break; - case 't': - if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; - else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; - break; - case 'n': - if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; - else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; - break; - case 'k': - if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; - break; - case 'u': - if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; - else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; - break; - case 'L': - if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; - break; - case '3': - 
if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; - break; - case 'd': - if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; - break; - case 'v': - if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; - break; - case 'b': - if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; - break; - } - return 0; -} -static inline int gff_ignored_biotype(args_t *args, char *ss) -{ - ss = strstr(ss,"biotype="); - if ( !ss ) return 0; - - ss += 8; - char *se = ss, tmp; - while ( *se && *se!=';' ) se++; - tmp = *se; - *se = 0; - - char *key = ss; - int n = 0; - if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); - khash_str2int_set(args->init.ignored_biotypes, key, n+1); - - *se = tmp; - return 1; -} -gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) -{ - khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); - gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); - if ( !gene ) - { - gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); - int ret; - k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); - kh_val(aux->gid2gene,k) = gene; - } - return gene; -} -void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) -{ - aux_t *aux = &args->init; - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line); - return; - } - - // create a mapping from transcript_id to gene_id - uint32_t trid, gene_id; - if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) ) - { - if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) ) - error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - 
fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line); - warned = 1; - } - } - if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) ) - { - if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) ) - error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line); - warned = 1; - } - } - - tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); - tr->id = trid; - tr->strand = ftr->strand; - tr->gene = gene_init(aux, gene_id); - tr->type = biotype; - tr->beg = ftr->beg; - tr->end = ftr->end; - - khint_t k; - int ret; - k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); - kh_val(aux->id2tr,k) = tr; -} -void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr) -{ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line); - return; - } - - aux_t *aux = &args->init; - - // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id; - if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) ) - { - if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) ) - error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line); - warned = 1; - } - } - - gf_gene_t *gene = gene_init(aux, gene_id); - assert( !gene->name ); // the gene_id should be unique - - gene->iseq 
= feature_set_seq(args, chr_beg,chr_end); - - // substring search for "Name=OR4F5" - ss = strstr(chr_end+2,"Name="); - if ( ss ) - { - ss += 5; - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - gene->name = (char*) malloc(se-ss+1); - memcpy(gene->name,ss,se-ss); - gene->name[se-ss] = 0; - } - else - gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead -} -int gff_parse(args_t *args, char *line, ftr_t *ftr) -{ - // - skip empty lines and commented lines - // - columns - // 1. chr - // 2. - // 3. CDS, transcript, gene, ... - // 4-5. beg,end - // 6. - // 7. strand - // 8. phase - // 9. Parent=transcript:ENST(\d+);ID=... etc - - char *ss = line; - if ( !*ss ) return -1; // skip blank lines - if ( *ss=='#' ) return -1; // skip comments - - char *chr_beg, *chr_end; - gff_parse_chr(line, &chr_beg, &chr_end); - ss = gff_skip(line, chr_end + 2); - - // 3. column: is this a CDS, transcript, gene, etc. - if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } - else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } - else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } - else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } - else - { - int type = GFF_UNKN_LINE; - if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE; - else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE; - ss = gff_skip(line, ss); - ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); - ss = gff_skip(line, ss); - if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene: - if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) - { - // we ignore these, debug print to see new types: - ss = strstr(ss,"ID="); - if ( !ss ) return -1; // no ID, ignore the line - if ( !strncmp("chromosome",ss+3,10) ) return -1; - if ( !strncmp("supercontig",ss+3,11) ) return -1; - if ( args->verbosity > 0 ) 
fprintf(stderr,"ignored: %s\n", line); - return -1; - } - - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else error("Unknown strand: %c .. %s\n", *ss,ss); - - if ( type==GFF_TSCRIPT_LINE ) - gff_parse_transcript(args, line, ss, ftr); - else - gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr); - - return -1; - } - ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); - ss = gff_skip(line, ss); - - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } - ss += 2; - - // 8. column: phase (codon offset) - if ( *ss == '0' ) ftr->phase = 0; - else if ( *ss == '1' ) ftr->phase = 1; - else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase - else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } - ss += 2; - - // substring search for "Parent=transcript:ENST00000437963" - if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) ) - { - if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) ) - error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line); - warned = 1; - } - } - - ftr->iseq = feature_set_seq(args, chr_beg,chr_end); - return 0; -} - -static int cmp_cds_ptr(const void *a, const void *b) -{ - // comparison function for qsort of transcripts's CDS - if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; - if ( (*((gf_cds_t**)a))->beg > 
(*((gf_cds_t**)b))->beg ) return 1; - return 0; -} - -static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) -{ - *chr_beg = *chr_end = aux->seq[iseq]; - while ( (*chr_end)[1] ) (*chr_end)++; -} -tscript_t *tscript_init(aux_t *aux, uint32_t trid) -{ - khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); - tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k); - assert( tr ); - return tr; -} -void register_cds(args_t *args, ftr_t *ftr) -{ - // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. - // ftr is the result of parsing a gff CDS line - aux_t *aux = &args->init; - - tscript_t *tr = tscript_init(aux, ftr->trid); - if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); - - gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); - cds->tr = tr; - cds->beg = ftr->beg; - cds->len = ftr->end - ftr->beg + 1; - cds->icds = 0; // to keep valgrind on mac happy - cds->phase = ftr->phase; - - hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); - tr->cds[tr->ncds++] = cds; -} -void register_utr(args_t *args, ftr_t *ftr) -{ - aux_t *aux = &args->init; - gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); - utr->which = ftr->type==GF_UTR3 ? 
prime3 : prime5; - utr->beg = ftr->beg; - utr->end = ftr->end; - utr->tr = tscript_init(aux, ftr->trid); - - char *chr_beg, *chr_end; - chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); -} -void register_exon(args_t *args, ftr_t *ftr) -{ - aux_t *aux = &args->init; - gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); - exon->beg = ftr->beg; - exon->end = ftr->end; - exon->tr = tscript_init(aux, ftr->trid); - - char *chr_beg, *chr_end; - chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); -} - -void tscript_init_cds(args_t *args) -{ - aux_t *aux = &args->init; - - // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) - khint_t k; - int warn_phase_unkn = 0; - for (k=0; kid2tr); k++) - { - if ( !kh_exist(aux->id2tr, k) ) continue; - tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k); - - // position-to-tscript lookup - char *chr_beg, *chr_end; - chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); - - if ( !tr->ncds ) continue; // transcript with no CDS - - // sort CDs - qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); - - // trim non-coding start - int i, len = 0; - if ( tr->strand==STRAND_FWD ) - { - if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) - { - if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; - tr->cds[0]->beg += tr->cds[0]->phase; - tr->cds[0]->len -= tr->cds[0]->phase; - tr->cds[0]->phase = 0; - } - - // sanity check phase; the phase number in gff tells us how many bases to skip in this - // feature to reach the first base of the next codon - int tscript_ok = 1; - for (i=0; incds; i++) - { - if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) - { - warn_phase_unkn = 1; - len += tr->cds[i]->len; - continue; - } 
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3 ) - { - if ( args->force ) - { - if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; - } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - } - len += tr->cds[i]->len; - } - if ( !tscript_ok ) continue; // skip this transcript - } - else - { - if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) - { - // Check that the phase is not bigger than CDS length. Curiously, this can really happen, - // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 - // todo: the same for the fwd strand - i = tr->ncds - 1; - int phase = tr->cds[i]->phase; - if ( phase ) tr->trim |= TRIM_5PRIME; - while ( i>=0 && phase > tr->cds[i]->len ) - { - phase -= tr->cds[i]->len; - tr->cds[i]->phase = 0; - tr->cds[i]->len = 0; - i--; - } - tr->cds[i]->len -= tr->cds[i]->phase; - tr->cds[i]->phase = 0; - } - - // sanity check phase - int tscript_ok = 1; - for (i=tr->ncds-1; i>=0; i--) - { - if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) - { - warn_phase_unkn = 1; - len += tr->cds[i]->len; - continue; - } - int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3) - { - if ( args->force ) - { - if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; - } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - } - len += tr->cds[i]->len; - } - if ( !tscript_ok ) continue; // skip this transcript - } - - // set len. At the same check that CDS within a transcript do not overlap - len = 0; - for (i=0; incds; i++) - { - tr->cds[i]->icds = i; - len += tr->cds[i]->len; - if ( !i ) continue; - - gf_cds_t *a = tr->cds[i-1]; - gf_cds_t *b = tr->cds[i]; - if ( a->beg + a->len - 1 >= b->beg ) - { - if ( args->force ) - { - fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n", - args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); - } - else - error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n" - " Use the --force option to override (at your own risk).\n", - args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); - } - } - if ( len%3 != 0 ) - { - // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 - // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 - // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. - - tr->trim |= TRIM_3PRIME; - if ( tr->strand==STRAND_FWD ) - { - i = tr->ncds - 1; - while ( i>=0 && len%3 ) - { - int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; - tr->cds[i]->len -= dlen; - len -= dlen; - i--; - } - } - else - { - i = 0; - while ( incds && len%3 ) - { - int dlen = tr->cds[i]->len >= len%3 ? 
len%3 : tr->cds[i]->len; - tr->cds[i]->len -= dlen; - tr->cds[i]->beg += dlen; - len -= dlen; - i++; - } - } - } - - // set CDS offsets and insert into regidx - len=0; - for (i=0; incds; i++) - { - tr->cds[i]->pos = len; - len += tr->cds[i]->len; - regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); - } - } - if ( warn_phase_unkn && args->verbosity > 0 ) - fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n"); -} - -void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } -void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); } - -void init_gff(args_t *args) -{ - aux_t *aux = &args->init; - aux->seq2int = khash_str2int_init(); // chrom's numeric id - aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene - aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t - args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); - aux->ignored_biotypes = khash_str2int_init(); - gff_id_init(&aux->gene_ids); - gff_id_init(&args->tscript_ids); - - // parse gff - kstring_t str = {0,0,0}; - htsFile *fp = hts_open(args->gff_fname,"r"); - if ( !fp ) error("Failed to read %s\n", args->gff_fname); - while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) - { - hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); - int ret = gff_parse(args, str.s, aux->ftr + aux->nftr); - if ( !ret ) aux->nftr++; - } - free(str.s); - if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname); - - - // process gff information: connect CDS and exons to transcripts - args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); - args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); - args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); - args->itr = regitr_init(NULL); - - int i; - for (i=0; 
inftr; i++) - { - ftr_t *ftr = &aux->ftr[i]; - - // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? - khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); - if ( k==kh_end(aux->id2tr) ) continue; // no such transcript - - tscript_t *tr = kh_val(aux->id2tr,k); - if ( !tr->gene->name ) - { - // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript) - regidx_free_tscript(&tr); - kh_del(int2tscript, aux->id2tr,k); - continue; - } - - // populate regidx by category: - // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 - // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... - if ( ftr->type==GF_CDS ) register_cds(args, ftr); - else if ( ftr->type==GF_EXON ) register_exon(args, ftr); - else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); - else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); - else - error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); - } - tscript_init_cds(args); - - if ( args->verbosity > 0 ) - { - fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", - regidx_nregs(args->idx_tscript), - regidx_nregs(args->idx_exon), - regidx_nregs(args->idx_cds), - regidx_nregs(args->idx_utr)); - } - if ( !regidx_nregs(args->idx_tscript) ) - fprintf(stderr, - "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n" - " or misc/gff2gff.py script can fix the problem (both do different things). 
See also the man page for the description\n" - " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); - - free(aux->ftr); - khash_str2int_destroy_free(aux->seq2int); - // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); - kh_destroy(int2tscript,aux->id2tr); - free(aux->seq); - gff_id_destroy(&aux->gene_ids); - - if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) - { - khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; - fprintf(stderr,"Ignored the following biotypes:\n"); - for (i = kh_begin(ign); i < kh_end(ign); i++) - { - if ( !kh_exist(ign,i)) continue; - const char *biotype = kh_key(ign,i); - if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; - fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype); - } - } - khash_str2int_destroy_free(aux->ignored_biotypes); -} - static inline int ncsq2_to_nfmt(int ncsq2) { return 1 + (ncsq2 - 1) / 30; @@ -1474,8 +483,17 @@ void init_data(args_t *args) args->fai = fai_load(args->fa_fname); if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); - if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); - init_gff(args); + args->gff = gff_init(args->gff_fname); + gff_set(args->gff,verbosity,args->verbosity); + gff_set(args->gff,strip_chr_names,args->unify_chr_names); + gff_set(args->gff,force_out_of_phase,args->force); + gff_set(args->gff,dump_fname,args->dump_gff); + gff_parse(args->gff); + args->idx_cds = gff_get(args->gff,idx_cds); + args->idx_utr = gff_get(args->gff,idx_utr); + args->idx_exon = gff_get(args->gff,idx_exon); + args->idx_tscript = gff_get(args->gff,idx_tscript); + args->itr = regitr_init(NULL); args->rid = -1; @@ -1536,6 +554,7 @@ void init_data(args_t *args) if ( args->hdr_nsmpl ) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to 
%s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n"); } @@ -1547,21 +566,8 @@ void destroy_data(args_t *args) "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n" " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2); - regidx_destroy(args->idx_cds); - regidx_destroy(args->idx_utr); - regidx_destroy(args->idx_exon); - regidx_destroy(args->idx_tscript); regitr_destroy(args->itr); - - khint_t k,i,j; - for (k=0; kinit.gid2gene); k++) - { - if ( !kh_exist(args->init.gid2gene, k) ) continue; - gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k); - free(gene->name); - free(gene); - } - kh_destroy(int2gene,args->init.gid2gene); + gff_destroy(args->gff); if ( args->filter ) filter_destroy(args->filter); @@ -1569,9 +575,20 @@ void destroy_data(args_t *args) khp_destroy(trhp,args->active_tr); kh_destroy(pos2vbuf,args->pos2vbuf); if ( args->smpl ) smpl_ilist_destroy(args->smpl); - int ret; + int i,j,ret; if ( args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } ret = hts_close(args->out_fh); + } else ret = fclose(args->out); if ( ret ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); @@ -1602,7 +619,7 @@ void destroy_data(args_t *args) free(args->gt_arr); free(args->str.s); free(args->str2.s); - gff_id_destroy(&args->tscript_ids); + free(args->chr_name); } /* @@ -1614,7 +631,7 @@ void destroy_data(args_t *args) #define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq typedef struct { - tscript_t *tr; + gf_tscript_t *tr; struct { int32_t pos, rlen, alen, ial; char *ref, *alt; @@ -1678,7 +695,7 @@ fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); if ( rbeg < splice->vcf.pos ) { assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD - kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); roff = 0; } else @@ -1703,7 +720,7 @@ fprintf(stderr,"r2: %s\n",splice->kref.s); if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD) rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end; if ( splice->kref.l < rlen ) - kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); } #if XDBG fprintf(stderr,"r3: %s\n",splice->kref.s); @@ -1714,7 +731,7 @@ fprintf(stderr,"r3: %s\n",splice->kref.s); if ( abeg < splice->vcf.pos ) { assert( splice->tr->beg <= abeg ); - kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); aoff = 0; } else @@ -1742,7 +759,7 @@ fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // 
trim, the requested sequence is too long alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end; if ( alen > 0 && alen > splice->kalt.l ) - kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); + kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); } #if XDBG fprintf(stderr,"a3: %s\n",splice->kalt.s); @@ -1755,7 +772,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 while ( regitr_overlap(itr) ) { gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); - tscript_t *tr = utr->tr; + gf_tscript_t *tr = utr->tr; if ( tr->id != trid ) continue; csq_t csq; memset(&csq, 0, sizeof(csq_t)); @@ -1771,7 +788,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 } return 0; } -static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial) +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial) { #if XDBG fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); @@ -1788,6 +805,21 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } +static inline const char *drop_chr_prefix(args_t *args, const char *chr) +{ + if ( !args->unify_chr_names ) return chr; + if ( !strncasecmp("chr",chr,3) ) return chr+3; + return chr; +} +static inline const char *add_chr_prefix(args_t *args, const char *chr) +{ + if ( !args->unify_chr_names ) return chr; + int len = strlen(chr); + hts_expand(char,len+4,args->mchr_name,args->chr_name); + memcpy(args->chr_name,"chr",3); + memcpy(args->chr_name+3,chr,len+1); + return args->chr_name; +} static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp @@ -1813,7 +845,7 @@ 
fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1851,7 +883,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1924,7 +956,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { static int small_ref_padding_warned = 0; - tscript_t *tr = splice->tr; + gf_tscript_t *tr = splice->tr; // We know the VCF record overlaps the exon, but does it overlap the start codon? 
if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; @@ -1956,7 +988,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint } char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele - char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted + char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted #if XDBG fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); #endif @@ -1985,7 +1017,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint } char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele - char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block + char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block #if XDBG fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); #endif @@ -2030,7 +1062,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2086,7 +1118,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2175,7 +1207,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2205,7 +1237,7 @@ fprintf(stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2291,7 +1323,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, { int i; kstring_t str = {0,0,0}; - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; child->icds = cds->icds; // index of cds in the tscript's list of exons child->vcf_ial = ial; @@ -2313,8 +1345,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, } if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M { - if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } - else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } } if ( child->icds!=0 ) splice.check_region_beg = 1; if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; @@ -2373,12 +1405,12 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. 
beg,end=%d %d, ret=%d, csq=%d\n\n // the variant is on a new exon, finish up the previous int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg; if ( len > 0 ) - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); } // append any skipped non-variant exons while ( ++i < cds->icds ) - kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); if ( parent->icds==child->icds ) { @@ -2390,10 +1422,10 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n free(splice.kalt.s); return 1; } - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); } else - kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); } kputs(splice.kalt.s + dbeg, &str); @@ -2645,28 +1677,28 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, #endif } -void tscript_splice_ref(tscript_t *tr) +void tscript_splice_ref(gf_tscript_t *tr) { int i, len = 0; for (i=0; incds; i++) len += tr->cds[i]->len; - tr->nsref = len + 2*N_REF_PAD; - tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); + TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD; + TSCRIPT_AUX(tr)->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); len = 0; - memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); + memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); len += N_REF_PAD; for (i=0; incds; i++) { - memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); + memcpy(TSCRIPT_AUX(tr)->sref + len, 
TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); len += tr->cds[i]->len; } - memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); + memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); len += N_REF_PAD; - tr->sref[len] = 0; + TSCRIPT_AUX(tr)->sref[len] = 0; } // returns: 0 if consequence was added, 1 if it already exists or could not be added @@ -2800,18 +1832,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) if ( csq->type & CSQ_UPSTREAM_STOP ) kputc_('*',str); - int i, n = sizeof(csq_strings)/sizeof(char*); + int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*); for (i=1; itype&(1<type&(1<type&(1<type&(1<biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) ) + { + if ( has_csq ) kputc_('&',str); // just in case, this should always be true + kputs("NMD_transcript",str); + } kputc_('|', str); if ( csq->gene ) kputs(csq->gene , str); kputc_('|', str); - if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); +// if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); + if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str); kputc_('|', str); kputs(gf_type2gff_string(csq->biotype), str); @@ -2840,7 +1879,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) { int i; - tscript_t *tr = hap->tr; + gf_tscript_t *tr = hap->tr; int ref_node = tr->strand==STRAND_FWD ? ibeg : iend; int icsq = node->ncsq_list++; hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); @@ -2954,7 +1993,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, str.l = 0; // create the aa variant string - int aa_rbeg = tr->strand==STRAND_FWD ? 
node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; + int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); @@ -3020,13 +2059,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, void hap_finalize(args_t *args, hap_t *hap) { - tscript_t *tr = hap->tr; - if ( !tr->sref ) + gf_tscript_t *tr = hap->tr; + if ( !TSCRIPT_AUX(tr)->sref ) tscript_splice_ref(tr); kstring_t sref; - sref.s = tr->sref; - sref.l = tr->nsref; + sref.s = TSCRIPT_AUX(tr)->sref; + sref.l = TSCRIPT_AUX(tr)->nsref; sref.m = sref.l; int istack = 0; @@ -3034,7 +2073,7 @@ void hap_finalize(args_t *args, hap_t *hap) hap->sseq.l = 0; hap->tseq.l = 0; - hap->stack[0].node = tr->root; + hap->stack[0].node = TSCRIPT_AUX(tr)->root; hap->stack[0].ichild = -1; hap->stack[0].slen = 0; hap->stack[0].dlen = 0; @@ -3214,7 +2253,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) kput_vcsq(args, &csq->type, &args->str); fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } -static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node) { if ( !node || !node->ncsq_list ) return; @@ -3240,7 +2279,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih } } -static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node) { if ( !node || !node->ncsq_list || ismpl<0 ) return; @@ -3276,23 +2315,23 @@ void hap_flush(args_t *args, uint32_t pos) tr_heap_t *heap = args->active_tr; while ( heap->ndat && 
heap->dat[0]->end<=pos ) { - tscript_t *tr = heap->dat[0]; + gf_tscript_t *tr = heap->dat[0]; khp_delete(trhp, heap); args->hap->tr = tr; - if ( tr->root && tr->root->nchild ) // normal, non-localized calling + if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling { hap_finalize(args, args->hap); if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf { if ( args->phase==PHASE_DROP_GT ) - hap_print_text(args, tr, -1,0, tr->hap[0]); + hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]); else { for (i=0; ismpl->n; i++) { for (j=0; j<2; j++) - hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]); + hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]); } } } @@ -3301,7 +2340,7 @@ void hap_flush(args_t *args, uint32_t pos) for (i=0; ismpl->n; i++) { for (j=0; j<2; j++) - hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]); + hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]); } } } @@ -3309,7 +2348,7 @@ void hap_flush(args_t *args, uint32_t pos) // mark the transcript for deletion. 
Cannot delete it immediately because // by-position VCF output will need them when flushed by vcf_buf_push args->nrm_tr++; - hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); + hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); args->rm_tr[args->nrm_tr-1] = tr; } } @@ -3424,24 +2463,33 @@ void vbuf_flush(args_t *args, uint32_t pos) for (i=0; inrm_tr; i++) { - tscript_t *tr = args->rm_tr[i]; - if ( tr->root ) hap_destroy(tr->root); - tr->root = NULL; - free(tr->hap); - free(tr->ref); - free(tr->sref); + gf_tscript_t *tr = args->rm_tr[i]; + tscript_t *aux = TSCRIPT_AUX(tr); + if ( aux->root ) hap_destroy(aux->root); + aux->root = NULL; + free(aux->hap); + free(aux->ref); + free(aux->sref); + free(aux); + tr->aux = NULL; } args->nrm_tr = 0; args->ncsq_buf = 0; } -void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) +void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) { int i, len; int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; - tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); - if ( !tr->ref ) + const char *tmp_chr = chr; + if ( !faidx_has_seq(args->fai,tmp_chr) ) + { + tmp_chr = drop_chr_prefix(args,chr); + if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr); + } + TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + if ( !TSCRIPT_AUX(tr)->ref ) error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); @@ -3449,23 +2497,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) { char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; - memcpy(ref+i, tr->ref, len); + memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len); len += i; for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; ref[i+len] = 0; - free(tr->ref); - tr->ref = ref; + 
free(TSCRIPT_AUX(tr)->ref); + TSCRIPT_AUX(tr)->ref = ref; } } -static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) +static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) { int vbeg = 0; int rbeg = rec->pos - tr->beg + N_REF_PAD; if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } - char *ref = tr->ref + rbeg; + char *ref = TSCRIPT_AUX(tr)->ref + rbeg; char *vcf = rec->d.allele[0] + vbeg; - assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); + assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD ); int i = 0; while ( ref[i] && vcf[i] ) { @@ -3479,7 +2527,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) int test_cds_local(args_t *args, bcf1_t *rec) { int i,j, ret = 0; - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; @@ -3491,12 +2539,13 @@ int test_cds_local(args_t *args, bcf1_t *rec) while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; if ( !GF_is_coding(tr->type) ) continue; ret = 1; - if ( !tr->ref ) + if ( !TSCRIPT_AUX(tr) ) { + tr->aux = calloc(sizeof(tscript_t),1); tscript_init_ref(args, tr, chr); tscript_splice_ref(tr); khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards @@ -3505,8 +2554,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) sanity_check_ref(args, tr, rec); kstring_t sref; - sref.s = tr->sref; - sref.l = tr->nsref; + sref.s = TSCRIPT_AUX(tr)->sref; + sref.l = TSCRIPT_AUX(tr)->nsref; sref.m = sref.l; for (i=1; in_allele; i++) @@ -3614,8 +2663,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) { 
// create the aa variant string kstring_t str = {0,0,0}; - int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; - int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; + int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); kprint_aa_prediction(args,aa_rbeg,tref,&str); @@ -3633,11 +2682,11 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_stage(args, &csq, rec); // all this only to clean vstr when vrec is flushed - if ( !tr->root ) - tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); - tr->root->ncsq_list++; - hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); - csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; + if ( !TSCRIPT_AUX(tr)->root ) + TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + TSCRIPT_AUX(tr)->root->ncsq_list++; + hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list); + csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1; rm_csq->type.vstr = str; } if ( csq_type & ~CSQ_COMPOUND ) @@ -3659,27 +2708,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) static int overlaps_warned = 0, multiploid_warned = 0; int i, ret = 0, hap_ret; - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = 
cds->tr; + gf_tscript_t *tr = cds->tr; if ( !GF_is_coding(tr->type) ) continue; if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; ret = 1; - if ( !tr->root ) + if ( !TSCRIPT_AUX(tr) ) { // initialize the transcript and its haplotype tree, fetch the reference sequence + tr->aux = calloc(sizeof(tscript_t),1); tscript_init_ref(args, tr, chr); - tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); - tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid - tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*)); - for (i=0; inhap; i++) tr->hap[i] = NULL; - tr->root->nend = tr->nhap; - tr->root->type = HAP_ROOT; + TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid + TSCRIPT_AUX(tr)->hap = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*)); + for (i=0; inhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL; + TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap; + TSCRIPT_AUX(tr)->root->type = HAP_ROOT; khp_insert(trhp, args->active_tr, &tr); } @@ -3689,7 +2739,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) if ( args->phase==PHASE_DROP_GT ) { if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } - hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root; + hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? 
TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root; hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); hap_ret = hap_init(args, parent, child, cds, rec, 1); if ( hap_ret!=0 ) @@ -3734,8 +2784,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) parent->mchild = 1; parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*)); parent->child[0] = child; - tr->hap[0] = child; - tr->hap[0]->nend = 1; + TSCRIPT_AUX(tr)->hap[0] = child; + TSCRIPT_AUX(tr)->hap[0]->nend = 1; continue; } @@ -3793,12 +2843,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) assert( ial < rec->n_allele ); if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; } - hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root; + hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root; if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 ) { // this haplotype has been seen in another sample - tr->hap[i] = parent->child[ parent->cur_child[ial] ]; - tr->hap[i]->nend++; + TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ]; + TSCRIPT_AUX(tr)->hap[i]->nend++; parent->nend--; continue; } @@ -3852,8 +2902,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child); parent->cur_child[ial] = j; parent->child[j] = child; - tr->hap[i] = child; - tr->hap[i]->nend++; + TSCRIPT_AUX(tr)->hap[i] = child; + TSCRIPT_AUX(tr)->hap[i]->nend++; parent->nend--; } } @@ -3933,7 +2983,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) } int test_utr(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; @@ -3944,7 +2994,7 @@ int test_utr(args_t *args, bcf1_t *rec) while ( 
regitr_overlap(args->itr) ) { gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); - tscript_t *tr = splice.tr = utr->tr; + gf_tscript_t *tr = splice.tr = utr->tr; for (i=1; in_allele; i++) { if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } @@ -3971,7 +3021,7 @@ int test_utr(args_t *args, bcf1_t *rec) } int test_splice(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; splice_t splice; @@ -4003,7 +3053,7 @@ int test_splice(args_t *args, bcf1_t *rec) } int test_tscript(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; @@ -4012,7 +3062,7 @@ int test_tscript(args_t *args, bcf1_t *rec) int i, ret = 0; while ( regitr_overlap(args->itr) ) { - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*); for (i=1; in_allele; i++) { if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } @@ -4046,7 +3096,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) warned = 1; } - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // only insertions atm int beg = rec->pos + 1; @@ -4061,7 +3111,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; csq.type.type = (GF_is_coding(tr->type) ? 
CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -4079,7 +3129,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); - tscript_t *tr = utr->tr; + gf_tscript_t *tr = utr->tr; csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class; csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -4118,7 +3168,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { csq_t csq; memset(&csq, 0, sizeof(csq_t)); - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*); splice.vcf.alt = rec->d.allele[1]; splice.csq = csq_class; int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); @@ -4179,7 +3229,10 @@ static void process(args_t *args, bcf1_t **rec_ptr) // Perform a simple sanity check (that does not catch much), the chromosome must be present in the // reference file if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) - error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + { + if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) ) + error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + } } if ( prev_pos > rec->pos ) error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); @@ -4254,9 +3307,12 @@ static const char *usage(void) " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" " s: skip unphased hets\n" - "Options:\n" - " -e, --exclude EXPR Exclude sites for which the expression is true\n" + "GFF options:\n" + " --dump-gff 
FILE.gz Dump the parsed GFF file (for debugging purposes)\n" " --force Run even if some sanity checks fail\n" + " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n" + "General options:\n" + " -e, --exclude EXPR Exclude sites for which the expression is true\n" " -i, --include EXPR Select sites for which the expression is true\n" " --no-version Do not append version and command line to the header\n" " -o, --output FILE Write output to a file [standard output]\n" @@ -4272,6 +3328,7 @@ static const char *usage(void) " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --threads INT Use multithreading with worker threads [0]\n" " -v, --verbose INT Verbosity level 0-2 [1]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4292,6 +3349,7 @@ int main_csq(int argc, char *argv[]) args->verbosity = 1; args->record_cmd_line = 1; args->clevel = -1; + args->unify_chr_names = 1; static struct option loptions[] = { @@ -4321,6 +3379,9 @@ int main_csq(int argc, char *argv[]) {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,5}, {"no-version",no_argument,NULL,3}, + {"write-index",no_argument,NULL,6}, + {"dump-gff",required_argument,NULL,7}, + {"unify-chr-names",required_argument,NULL,8}, {0,0,0,0} }; int c, targets_is_file = 0, regions_is_file = 0; @@ -4339,7 +3400,7 @@ int main_csq(int argc, char *argv[]) case 3 : args->record_cmd_line = 0; break; case 'b': args->brief_predictions = 1; - fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n"); + fprintf(stderr,"Warning: The -b option will be removed in future versions. 
Please use -B 1 instead.\n"); break; case 'B': args->brief_predictions = strtol(optarg,&tmp,10); @@ -4409,6 +3470,13 @@ int main_csq(int argc, char *argv[]) targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 6 : args->write_index = 1; break; + case 7 : args->dump_gff = optarg; break; + case 8 : + if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0; + else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1; + else error("Could not parse: --unify-chr-names %s\n",optarg); + break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index 8feb7af..5f590d1 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -37,7 +37,7 @@ Read about transcript types here http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html http://www.ensembl.org/info/genome/variation/predicted_data.html - http://www.gencodegenes.org/gencode_biotypes.html + https://www.gencodegenes.org/pages/biotypes.html List of supported biotypes antisense @@ -47,6 +47,7 @@ IG_LV_gene IG_V_gene lincRNA + lncRNA .. 
generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping macro_lncRNA miRNA misc_RNA @@ -54,7 +55,7 @@ Mt_tRNA polymorphic_pseudogene processed_transcript - protein_coding + protein_coding, mRNA ribozyme rRNA sRNA @@ -146,6 +147,7 @@ #include #include #include +#include #include #include #include @@ -155,6 +157,7 @@ #include "kheap.h" #include "smpl_ilist.h" #include "rbuf.h" +#include "gff.h" #ifndef __FUNCTION__ # define __FUNCTION__ __func__ @@ -164,20 +167,8 @@ #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 -// Definition of splice_region, splice_acceptor and splice_donor -#define N_SPLICE_DONOR 2 -#define N_SPLICE_REGION_EXON 3 -#define N_SPLICE_REGION_INTRON 8 - #define N_REF_PAD 10 // number of bases to avoid boundary effects -#define STRAND_REV 0 -#define STRAND_FWD 1 - -#define TRIM_NONE 0 -#define TRIM_5PRIME 1 -#define TRIM_3PRIME 2 - // How to treat phased/unphased genotypes #define PHASE_REQUIRE 0 // --phase r #define PHASE_MERGE 1 // --phase m @@ -225,6 +216,7 @@ #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) +#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING)) #define CSQ_PRN_BIOTYPE CSQ_NON_CODING // see kput_vcsq() @@ -256,119 +248,6 @@ const char *csq_strings[] = "start_retained" }; - -// GFF line types -#define GFF_UNKN_LINE 0 -#define GFF_TSCRIPT_LINE 1 -#define GFF_GENE_LINE 2 - - -/* - Genomic features, for fast lookup by position to overlapping features -*/ -#define GF_coding_bit 6 -#define GF_is_coding(x) ((x) & (1<aux) +typedef struct { - uint32_t id; // transcript id - uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive) - uint32_t strand:1, // STRAND_REV or STRAND_FWD - ncds:31, // number of exons - mcds; - gf_cds_t **cds; // ordered list of exons char *ref; // 
reference sequence, padded with N_REF_PAD bases on both ends char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends hap_node_t *root; // root of the haplotype tree hap_node_t **hap; // pointer to haplotype leaves, two for each sample int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD - uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types - type:30; // one of GF_* types - gf_gene_t *gene; -}; -static inline int cmp_tscript(tscript_t **a, tscript_t **b) +} +tscript_t; +static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b) { return ( (*a)->end < (*b)->end ) ? 1 : 0; } -KHEAP_INIT(trhp, tscript_t*, cmp_tscript) +KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript) typedef khp_trhp_t tr_heap_t; typedef struct { @@ -496,7 +368,7 @@ typedef struct { int mstack; hstack_t *stack; - tscript_t *tr; // tr->ref: spliced transcript on ref strand + gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand kstring_t sseq; // spliced haplotype sequence on ref strand kstring_t tseq; // the variable part of translated haplotype transcript, coding strand kstring_t tref; // the variable part of translated reference transcript, coding strand @@ -505,77 +377,20 @@ typedef struct } hap_t; - -/* - Helper structures, only for initialization - - ftr_t - temporary list of all exons, CDS, UTRs -*/ -KHASH_MAP_INIT_INT(int2tscript, tscript_t*) -KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) -typedef struct -{ - int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR - uint32_t beg; - uint32_t end; - uint32_t trid; - uint32_t strand:1; // STRAND_REV,STRAND_FWD - uint32_t phase:2; // 0, 1, 2, or 3 for unknown - uint32_t iseq:29; -} -ftr_t; -/* - Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) - to integer id. To keep the memory requirements low, the original version - relied on IDs in the form of a string prefix and a numerical id. 
However, - it turns out that this assumption is not valid for some ensembl GFFs, see - for example Zea_mays.AGPv4.36.gff3.gz - */ -typedef struct -{ - void *str2id; // khash_str2int - int nstr, mstr; - char **str; // numeric id to string -} -id_tbl_t; -typedef struct -{ - // all exons, CDS, UTRs - ftr_t *ftr; - int nftr, mftr; - - // mapping from gene id to gf_gene_t - kh_int2gene_t *gid2gene; - - // mapping from transcript id to tscript, for quick CDS anchoring - kh_int2tscript_t *id2tr; - - // sequences - void *seq2int; // str2int hash - char **seq; - int nseq, mseq; - - // ignored biotypes - void *ignored_biotypes; - - id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx -} -aux_t; - typedef struct _args_t { // the main regidx lookups, from chr:beg-end to overlapping features and // index iterator + gff_t *gff; regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; regitr_t *itr; - // temporary structures, deleted after initializtion - aux_t init; - // text tab-delimited output (out) or vcf/bcf output (out_fh) FILE *out; htsFile *out_fh; + char *index_fn; + int write_index; + char *dump_gff; // vcf bcf_srs_t *sr; @@ -599,6 +414,13 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; + int unify_chr_names; + char *chr_name; + int mchr_name; + struct { + int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + } warned; int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing @@ -606,11 +428,10 @@ typedef struct _args_t vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position 
- tscript_t **rm_tr; // buffer of transcripts to clean + gf_tscript_t **rm_tr; // buffer of transcripts to clean int nrm_tr, mrm_tr; csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs int ncsq_buf, mcsq_buf; - id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx int force; // force run under various conditions. Currently only to skip out-of-phase transcripts int n_threads; // extra compression/decompression threads @@ -647,818 +468,6 @@ const uint8_t cnt4[] = #define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] -static const char *gf_strings_noncoding[] = -{ - "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", - "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", - "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", - "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", - "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", - "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", - "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", - "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" -}; -static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; -static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; - -const char 
*gf_type2gff_string(int type) -{ - if ( !GF_is_coding(type) ) - { - if ( type < (1<init; - char c = chr_end[1]; - chr_end[1] = 0; - int iseq; - if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) - { - // check for possible mismatch in chromosome naming convention such as chrX vs X - char *new_chr = NULL; - if ( faidx_has_seq(args->fai,chr_beg) ) - new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx - else - { - int len = strlen(chr_beg); - if ( !strncmp("chr",chr_beg,3) && len>3 ) - new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not - else - { - new_chr = malloc(len+4); // gff does not have the prefix, faidx has - memcpy(new_chr,"chr",3); - memcpy(new_chr+3,chr_beg,len); - new_chr[len+3] = 0; - } - if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai - { - static int unkwn_chr_warned = 0; - if ( !unkwn_chr_warned && args->verbosity>0 ) - fprintf(bcftools_stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg); - unkwn_chr_warned = 1; - free(new_chr); - new_chr = strdup(chr_beg); // use the original sequence name - } - } - if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 ) - { - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = new_chr; - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq - } - else - free(new_chr); - } - chr_end[1] = c; - return iseq; -} -static inline char *gff_skip(const char *line, char *ss) -{ - while ( *ss && *ss!='\t' ) ss++; - if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - return ss+1; -} -static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end) -{ - char *se = (char*) line; - while ( *se && *se!='\t' ) se++; - if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - *chr_beg = 
(char*) line; - *chr_end = se-1; -} -static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) -{ - char *se = ss; - *beg = strtol(ss, &se, 10) - 1; - if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); - ss = se+1; - *end = strtol(ss, &se, 10) - 1; - if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - return se+1; -} -static void gff_id_init(id_tbl_t *tbl) -{ - memset(tbl, 0, sizeof(*tbl)); - tbl->str2id = khash_str2int_init(); -} -static void gff_id_destroy(id_tbl_t *tbl) -{ - khash_str2int_destroy_free(tbl->str2id); - free(tbl->str); -} -// returns 0 on success, -1 on failure -static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr) -{ - ss = strstr(ss,needle); // e.g. "ID=transcript:" - if ( !ss ) return -1; - ss += strlen(needle); - - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - char tmp = *se; - *se = 0; - - int id; - if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 ) - { - id = tbl->nstr++; - hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); - tbl->str[id] = strdup(ss); - khash_str2int_set(tbl->str2id, tbl->str[id], id); - } - *se = tmp; - *id_ptr = id; - return 0; -} -static inline int gff_parse_type(char *line) -{ - line = strstr(line,"ID="); - if ( !line ) return -1; - line += 3; - if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE; - else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE; - return -1; -} -static inline int gff_parse_biotype(char *_line) -{ - char *line = strstr(_line,"biotype="); - if ( !line ) return -1; - - line += 8; - switch (*line) - { - case 'p': - if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; - else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; - else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; - else if ( 
!strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; - break; - case 'a': - if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT; - else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; - else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; - break; - case 'I': - if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C; - else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D; - else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J; - else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV; - else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V; - else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; - else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; - else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; - else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; - break; - case 'T': - if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C; - else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D; - else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J; - else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V; - else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; - else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; - break; - case 'M': - if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; - else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; - else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; - break; - case 'l': - if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; - break; - case 'm': - if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; - else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; - else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; - else if ( !strncmp(line,"miRNA",5) ) return 
GF_miRNA; - else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; - break; - case 'r': - if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; - else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; - else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; - else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; - break; - case 's': - if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; - else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; - else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; - else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; - else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; - else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; - else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; - break; - case 't': - if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; - else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; - break; - case 'n': - if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; - else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; - break; - case 'k': - if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; - break; - case 'u': - if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; - else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; - break; - case 'L': - if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; - break; - case '3': - 
if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; - break; - case 'd': - if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; - break; - case 'v': - if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; - break; - case 'b': - if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; - break; - } - return 0; -} -static inline int gff_ignored_biotype(args_t *args, char *ss) -{ - ss = strstr(ss,"biotype="); - if ( !ss ) return 0; - - ss += 8; - char *se = ss, tmp; - while ( *se && *se!=';' ) se++; - tmp = *se; - *se = 0; - - char *key = ss; - int n = 0; - if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); - khash_str2int_set(args->init.ignored_biotypes, key, n+1); - - *se = tmp; - return 1; -} -gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) -{ - khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); - gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); - if ( !gene ) - { - gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); - int ret; - k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); - kh_val(aux->gid2gene,k) = gene; - } - return gene; -} -void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) -{ - aux_t *aux = &args->init; - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript, unknown biotype: %s\n",line); - return; - } - - // create a mapping from transcript_id to gene_id - uint32_t trid, gene_id; - if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) ) - { - if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) ) - error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - 
fprintf(bcftools_stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line); - warned = 1; - } - } - if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) ) - { - if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) ) - error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(bcftools_stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line); - warned = 1; - } - } - - tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); - tr->id = trid; - tr->strand = ftr->strand; - tr->gene = gene_init(aux, gene_id); - tr->type = biotype; - tr->beg = ftr->beg; - tr->end = ftr->end; - - khint_t k; - int ret; - k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); - kh_val(aux->id2tr,k) = tr; -} -void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr) -{ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene, unknown biotype: %s\n",line); - return; - } - - aux_t *aux = &args->init; - - // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id; - if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) ) - { - if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) ) - error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(bcftools_stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line); - warned = 1; - } - } - - gf_gene_t *gene = gene_init(aux, gene_id); - assert( !gene->name ); // the 
gene_id should be unique - - gene->iseq = feature_set_seq(args, chr_beg,chr_end); - - // substring search for "Name=OR4F5" - ss = strstr(chr_end+2,"Name="); - if ( ss ) - { - ss += 5; - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - gene->name = (char*) malloc(se-ss+1); - memcpy(gene->name,ss,se-ss); - gene->name[se-ss] = 0; - } - else - gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead -} -int gff_parse(args_t *args, char *line, ftr_t *ftr) -{ - // - skip empty lines and commented lines - // - columns - // 1. chr - // 2. - // 3. CDS, transcript, gene, ... - // 4-5. beg,end - // 6. - // 7. strand - // 8. phase - // 9. Parent=transcript:ENST(\d+);ID=... etc - - char *ss = line; - if ( !*ss ) return -1; // skip blank lines - if ( *ss=='#' ) return -1; // skip comments - - char *chr_beg, *chr_end; - gff_parse_chr(line, &chr_beg, &chr_end); - ss = gff_skip(line, chr_end + 2); - - // 3. column: is this a CDS, transcript, gene, etc. 
- if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } - else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } - else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } - else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } - else - { - int type = GFF_UNKN_LINE; - if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE; - else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE; - ss = gff_skip(line, ss); - ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); - ss = gff_skip(line, ss); - if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene: - if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) - { - // we ignore these, debug print to see new types: - ss = strstr(ss,"ID="); - if ( !ss ) return -1; // no ID, ignore the line - if ( !strncmp("chromosome",ss+3,10) ) return -1; - if ( !strncmp("supercontig",ss+3,11) ) return -1; - if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored: %s\n", line); - return -1; - } - - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else error("Unknown strand: %c .. %s\n", *ss,ss); - - if ( type==GFF_TSCRIPT_LINE ) - gff_parse_transcript(args, line, ss, ftr); - else - gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr); - - return -1; - } - ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); - ss = gff_skip(line, ss); - - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } - ss += 2; - - // 8. column: phase (codon offset) - if ( *ss == '0' ) ftr->phase = 0; - else if ( *ss == '1' ) ftr->phase = 1; - else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' 
) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase - else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } - ss += 2; - - // substring search for "Parent=transcript:ENST00000437963" - if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) ) - { - if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) ) - error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - static int warned = 0; - if ( !warned && args->verbosity > 0 ) - { - fprintf(bcftools_stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line); - warned = 1; - } - } - - ftr->iseq = feature_set_seq(args, chr_beg,chr_end); - return 0; -} - -static int cmp_cds_ptr(const void *a, const void *b) -{ - // comparison function for qsort of transcripts's CDS - if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; - if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1; - return 0; -} - -static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) -{ - *chr_beg = *chr_end = aux->seq[iseq]; - while ( (*chr_end)[1] ) (*chr_end)++; -} -tscript_t *tscript_init(aux_t *aux, uint32_t trid) -{ - khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); - tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k); - assert( tr ); - return tr; -} -void register_cds(args_t *args, ftr_t *ftr) -{ - // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. - // ftr is the result of parsing a gff CDS line - aux_t *aux = &args->init; - - tscript_t *tr = tscript_init(aux, ftr->trid); - if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. 
%d vs %d\n",ftr->trid,tr->strand,ftr->strand); - - gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); - cds->tr = tr; - cds->beg = ftr->beg; - cds->len = ftr->end - ftr->beg + 1; - cds->icds = 0; // to keep valgrind on mac happy - cds->phase = ftr->phase; - - hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); - tr->cds[tr->ncds++] = cds; -} -void register_utr(args_t *args, ftr_t *ftr) -{ - aux_t *aux = &args->init; - gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); - utr->which = ftr->type==GF_UTR3 ? prime3 : prime5; - utr->beg = ftr->beg; - utr->end = ftr->end; - utr->tr = tscript_init(aux, ftr->trid); - - char *chr_beg, *chr_end; - chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); -} -void register_exon(args_t *args, ftr_t *ftr) -{ - aux_t *aux = &args->init; - gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); - exon->beg = ftr->beg; - exon->end = ftr->end; - exon->tr = tscript_init(aux, ftr->trid); - - char *chr_beg, *chr_end; - chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); -} - -void tscript_init_cds(args_t *args) -{ - aux_t *aux = &args->init; - - // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) - khint_t k; - int warn_phase_unkn = 0; - for (k=0; kid2tr); k++) - { - if ( !kh_exist(aux->id2tr, k) ) continue; - tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k); - - // position-to-tscript lookup - char *chr_beg, *chr_end; - chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); - regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); - - if ( !tr->ncds ) continue; // transcript with no CDS - - // sort CDs - qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); - - // trim non-coding start - int i, len = 0; - if ( tr->strand==STRAND_FWD ) - { - if 
( tr->cds[0]->phase != CDS_PHASE_UNKN ) - { - if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; - tr->cds[0]->beg += tr->cds[0]->phase; - tr->cds[0]->len -= tr->cds[0]->phase; - tr->cds[0]->phase = 0; - } - - // sanity check phase; the phase number in gff tells us how many bases to skip in this - // feature to reach the first base of the next codon - int tscript_ok = 1; - for (i=0; incds; i++) - { - if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) - { - warn_phase_unkn = 1; - len += tr->cds[i]->len; - continue; - } - int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3 ) - { - if ( args->force ) - { - if ( args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; - } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - } - len += tr->cds[i]->len; - } - if ( !tscript_ok ) continue; // skip this transcript - } - else - { - if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) - { - // Check that the phase is not bigger than CDS length. 
Curiously, this can really happen, - // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 - // todo: the same for the fwd strand - i = tr->ncds - 1; - int phase = tr->cds[i]->phase; - if ( phase ) tr->trim |= TRIM_5PRIME; - while ( i>=0 && phase > tr->cds[i]->len ) - { - phase -= tr->cds[i]->len; - tr->cds[i]->phase = 0; - tr->cds[i]->len = 0; - i--; - } - tr->cds[i]->len -= tr->cds[i]->phase; - tr->cds[i]->phase = 0; - } - - // sanity check phase - int tscript_ok = 1; - for (i=tr->ncds-1; i>=0; i--) - { - if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) - { - warn_phase_unkn = 1; - len += tr->cds[i]->len; - continue; - } - int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3) - { - if ( args->force ) - { - if ( args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; - } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - } - len += tr->cds[i]->len; - } - if ( !tscript_ok ) continue; // skip this transcript - } - - // set len. At the same check that CDS within a transcript do not overlap - len = 0; - for (i=0; incds; i++) - { - tr->cds[i]->icds = i; - len += tr->cds[i]->len; - if ( !i ) continue; - - gf_cds_t *a = tr->cds[i-1]; - gf_cds_t *b = tr->cds[i]; - if ( a->beg + a->len - 1 >= b->beg ) - { - if ( args->force ) - { - fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n", - args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); - } - else - error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. 
ribosomal slippage)?\n" - " Use the --force option to override (at your own risk).\n", - args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); - } - } - if ( len%3 != 0 ) - { - // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 - // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 - // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. - - tr->trim |= TRIM_3PRIME; - if ( tr->strand==STRAND_FWD ) - { - i = tr->ncds - 1; - while ( i>=0 && len%3 ) - { - int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; - tr->cds[i]->len -= dlen; - len -= dlen; - i--; - } - } - else - { - i = 0; - while ( incds && len%3 ) - { - int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; - tr->cds[i]->len -= dlen; - tr->cds[i]->beg += dlen; - len -= dlen; - i++; - } - } - } - - // set CDS offsets and insert into regidx - len=0; - for (i=0; incds; i++) - { - tr->cds[i]->pos = len; - len += tr->cds[i]->len; - regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); - } - } - if ( warn_phase_unkn && args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n"); -} - -void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } -void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); } - -void init_gff(args_t *args) -{ - aux_t *aux = &args->init; - aux->seq2int = khash_str2int_init(); // chrom's numeric id - aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene - aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t - args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); - aux->ignored_biotypes = khash_str2int_init(); - 
gff_id_init(&aux->gene_ids); - gff_id_init(&args->tscript_ids); - - // parse gff - kstring_t str = {0,0,0}; - htsFile *fp = hts_open(args->gff_fname,"r"); - if ( !fp ) error("Failed to read %s\n", args->gff_fname); - while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) - { - hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); - int ret = gff_parse(args, str.s, aux->ftr + aux->nftr); - if ( !ret ) aux->nftr++; - } - free(str.s); - if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname); - - - // process gff information: connect CDS and exons to transcripts - args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); - args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); - args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); - args->itr = regitr_init(NULL); - - int i; - for (i=0; inftr; i++) - { - ftr_t *ftr = &aux->ftr[i]; - - // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? - khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); - if ( k==kh_end(aux->id2tr) ) continue; // no such transcript - - tscript_t *tr = kh_val(aux->id2tr,k); - if ( !tr->gene->name ) - { - // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript) - regidx_free_tscript(&tr); - kh_del(int2tscript, aux->id2tr,k); - continue; - } - - // populate regidx by category: - // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 - // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... 
- if ( ftr->type==GF_CDS ) register_cds(args, ftr); - else if ( ftr->type==GF_EXON ) register_exon(args, ftr); - else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); - else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); - else - error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); - } - tscript_init_cds(args); - - if ( args->verbosity > 0 ) - { - fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", - regidx_nregs(args->idx_tscript), - regidx_nregs(args->idx_exon), - regidx_nregs(args->idx_cds), - regidx_nregs(args->idx_utr)); - } - if ( !regidx_nregs(args->idx_tscript) ) - fprintf(bcftools_stderr, - "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n" - " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" - " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); - - free(aux->ftr); - khash_str2int_destroy_free(aux->seq2int); - // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); - kh_destroy(int2tscript,aux->id2tr); - free(aux->seq); - gff_id_destroy(&aux->gene_ids); - - if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) - { - khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; - fprintf(bcftools_stderr,"Ignored the following biotypes:\n"); - for (i = kh_begin(ign); i < kh_end(ign); i++) - { - if ( !kh_exist(ign,i)) continue; - const char *biotype = kh_key(ign,i); - if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; - fprintf(bcftools_stderr,"\t%dx\t.. 
%s\n", kh_value(ign,i), biotype); - } - } - khash_str2int_destroy_free(aux->ignored_biotypes); -} - static inline int ncsq2_to_nfmt(int ncsq2) { return 1 + (ncsq2 - 1) / 30; @@ -1476,8 +485,17 @@ void init_data(args_t *args) args->fai = fai_load(args->fa_fname); if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); - if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); - init_gff(args); + args->gff = gff_init(args->gff_fname); + gff_set(args->gff,verbosity,args->verbosity); + gff_set(args->gff,strip_chr_names,args->unify_chr_names); + gff_set(args->gff,force_out_of_phase,args->force); + gff_set(args->gff,dump_fname,args->dump_gff); + gff_parse(args->gff); + args->idx_cds = gff_get(args->gff,idx_cds); + args->idx_utr = gff_get(args->gff,idx_utr); + args->idx_exon = gff_get(args->gff,idx_exon); + args->idx_tscript = gff_get(args->gff,idx_tscript); + args->itr = regitr_init(NULL); args->rid = -1; @@ -1538,6 +556,7 @@ void init_data(args_t *args) if ( args->hdr_nsmpl ) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n"); } @@ -1549,21 +568,8 @@ void destroy_data(args_t *args) "Note: Some samples had too many consequences to be represented in %d bytes. 
If you need to record them all,\n" " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2); - regidx_destroy(args->idx_cds); - regidx_destroy(args->idx_utr); - regidx_destroy(args->idx_exon); - regidx_destroy(args->idx_tscript); regitr_destroy(args->itr); - - khint_t k,i,j; - for (k=0; kinit.gid2gene); k++) - { - if ( !kh_exist(args->init.gid2gene, k) ) continue; - gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k); - free(gene->name); - free(gene); - } - kh_destroy(int2gene,args->init.gid2gene); + gff_destroy(args->gff); if ( args->filter ) filter_destroy(args->filter); @@ -1571,9 +577,20 @@ void destroy_data(args_t *args) khp_destroy(trhp,args->active_tr); kh_destroy(pos2vbuf,args->pos2vbuf); if ( args->smpl ) smpl_ilist_destroy(args->smpl); - int ret; + int i,j,ret; if ( args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } ret = hts_close(args->out_fh); + } else ret = fclose(args->out); if ( ret ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"bcftools_stdout"); @@ -1604,7 +621,7 @@ void destroy_data(args_t *args) free(args->gt_arr); free(args->str.s); free(args->str2.s); - gff_id_destroy(&args->tscript_ids); + free(args->chr_name); } /* @@ -1616,7 +633,7 @@ void destroy_data(args_t *args) #define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq typedef struct { - tscript_t *tr; + gf_tscript_t *tr; struct { int32_t pos, rlen, alen, ial; char *ref, *alt; @@ -1680,7 +697,7 @@ fprintf(bcftools_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg) if ( rbeg < splice->vcf.pos ) { assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD - kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref); roff = 0; } else @@ -1705,7 +722,7 @@ fprintf(bcftools_stderr,"r2: %s\n",splice->kref.s); if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD) rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end; if ( splice->kref.l < rlen ) - kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); } #if XDBG fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s); @@ -1716,7 +733,7 @@ fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s); if ( abeg < splice->vcf.pos ) { assert( splice->tr->beg <= abeg ); - kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); + kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt); aoff = 0; } else @@ -1744,7 +761,7 @@ fprintf(bcftools_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); if ( end + 
alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end; if ( alen > 0 && alen > splice->kalt.l ) - kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); + kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); } #if XDBG fprintf(bcftools_stderr,"a3: %s\n",splice->kalt.s); @@ -1757,7 +774,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 while ( regitr_overlap(itr) ) { gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); - tscript_t *tr = utr->tr; + gf_tscript_t *tr = utr->tr; if ( tr->id != trid ) continue; csq_t csq; memset(&csq, 0, sizeof(csq_t)); @@ -1773,7 +790,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 } return 0; } -static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial) +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial) { #if XDBG fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); @@ -1790,6 +807,21 @@ fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } +static inline const char *drop_chr_prefix(args_t *args, const char *chr) +{ + if ( !args->unify_chr_names ) return chr; + if ( !strncasecmp("chr",chr,3) ) return chr+3; + return chr; +} +static inline const char *add_chr_prefix(args_t *args, const char *chr) +{ + if ( !args->unify_chr_names ) return chr; + int len = strlen(chr); + hts_expand(char,len+4,args->mchr_name,args->chr_name); + memcpy(args->chr_name,"chr",3); + memcpy(args->chr_name+3,chr,len+1); + return args->chr_name; +} static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { // coordinates that matter 
for consequences, eg AC>ACG trimmed to C>CG, 1bp @@ -1815,7 +847,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1853,7 +885,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); @@ -1926,7 +958,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) { static int small_ref_padding_warned = 0; - tscript_t *tr = splice->tr; + gf_tscript_t *tr = splice->tr; // We know the VCF record overlaps the exon, but does it overlap the start codon? 
if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; @@ -1958,7 +990,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint } char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele - char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted + char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted #if XDBG fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); #endif @@ -1987,7 +1019,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint } char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele - char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block + char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block #if XDBG fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); #endif @@ -2032,7 +1064,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2088,7 +1120,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg, if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2177,7 +1209,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2207,7 +1239,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_utr ) { regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); @@ -2293,7 +1325,7 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, { int i; kstring_t str = {0,0,0}; - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; child->icds = cds->icds; // index of cds in the tscript's list of exons child->vcf_ial = ial; @@ -2315,8 +1347,8 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, } if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M { - if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } - else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } + if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; } + else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; } } if ( child->icds!=0 ) splice.check_region_beg = 1; if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; @@ -2375,12 +1407,12 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. 
beg,end=%d %d, ret=%d, c // the variant is on a new exon, finish up the previous int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg; if ( len > 0 ) - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); } // append any skipped non-variant exons while ( ++i < cds->icds ) - kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str); if ( parent->icds==child->icds ) { @@ -2392,10 +1424,10 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c free(splice.kalt.s); return 1; } - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); } else - kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); + kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str); } kputs(splice.kalt.s + dbeg, &str); @@ -2647,28 +1679,28 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r #endif } -void tscript_splice_ref(tscript_t *tr) +void tscript_splice_ref(gf_tscript_t *tr) { int i, len = 0; for (i=0; incds; i++) len += tr->cds[i]->len; - tr->nsref = len + 2*N_REF_PAD; - tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); + TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD; + TSCRIPT_AUX(tr)->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); len = 0; - memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); + memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); len += N_REF_PAD; for (i=0; incds; i++) { - memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); + memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + 
N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); len += tr->cds[i]->len; } - memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); + memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); len += N_REF_PAD; - tr->sref[len] = 0; + TSCRIPT_AUX(tr)->sref[len] = 0; } // returns: 0 if consequence was added, 1 if it already exists or could not be added @@ -2802,18 +1834,25 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) if ( csq->type & CSQ_UPSTREAM_STOP ) kputc_('*',str); - int i, n = sizeof(csq_strings)/sizeof(char*); + int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*); for (i=1; itype&(1<type&(1<type&(1<type&(1<biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) ) + { + if ( has_csq ) kputc_('&',str); // just in case, this should always be true + kputs("NMD_transcript",str); + } kputc_('|', str); if ( csq->gene ) kputs(csq->gene , str); kputc_('|', str); - if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); +// if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); + if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str); kputc_('|', str); kputs(gf_type2gff_string(csq->biotype), str); @@ -2842,7 +1881,7 @@ void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) { int i; - tscript_t *tr = hap->tr; + gf_tscript_t *tr = hap->tr; int ref_node = tr->strand==STRAND_FWD ? ibeg : iend; int icsq = node->ncsq_list++; hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); @@ -2956,7 +1995,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, str.l = 0; // create the aa variant string - int aa_rbeg = tr->strand==STRAND_FWD ? 
node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; + int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1; int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); @@ -3022,13 +2061,13 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, void hap_finalize(args_t *args, hap_t *hap) { - tscript_t *tr = hap->tr; - if ( !tr->sref ) + gf_tscript_t *tr = hap->tr; + if ( !TSCRIPT_AUX(tr)->sref ) tscript_splice_ref(tr); kstring_t sref; - sref.s = tr->sref; - sref.l = tr->nsref; + sref.s = TSCRIPT_AUX(tr)->sref; + sref.l = TSCRIPT_AUX(tr)->nsref; sref.m = sref.l; int istack = 0; @@ -3036,7 +2075,7 @@ void hap_finalize(args_t *args, hap_t *hap) hap->sseq.l = 0; hap->tseq.l = 0; - hap->stack[0].node = tr->root; + hap->stack[0].node = TSCRIPT_AUX(tr)->root; hap->stack[0].ichild = -1; hap->stack[0].slen = 0; hap->stack[0].dlen = 0; @@ -3216,7 +2255,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) kput_vcsq(args, &csq->type, &args->str); fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } -static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node) { if ( !node || !node->ncsq_list ) return; @@ -3242,7 +2281,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih } } -static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) +static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node) { if ( !node || !node->ncsq_list || ismpl<0 ) return; @@ -3278,23 +2317,23 @@ void hap_flush(args_t *args, uint32_t pos) tr_heap_t *heap = args->active_tr; while ( heap->ndat && 
heap->dat[0]->end<=pos ) { - tscript_t *tr = heap->dat[0]; + gf_tscript_t *tr = heap->dat[0]; khp_delete(trhp, heap); args->hap->tr = tr; - if ( tr->root && tr->root->nchild ) // normal, non-localized calling + if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling { hap_finalize(args, args->hap); if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf { if ( args->phase==PHASE_DROP_GT ) - hap_print_text(args, tr, -1,0, tr->hap[0]); + hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]); else { for (i=0; ismpl->n; i++) { for (j=0; j<2; j++) - hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]); + hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]); } } } @@ -3303,7 +2342,7 @@ void hap_flush(args_t *args, uint32_t pos) for (i=0; ismpl->n; i++) { for (j=0; j<2; j++) - hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]); + hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]); } } } @@ -3311,7 +2350,7 @@ void hap_flush(args_t *args, uint32_t pos) // mark the transcript for deletion. 
Cannot delete it immediately because // by-position VCF output will need them when flushed by vcf_buf_push args->nrm_tr++; - hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); + hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr); args->rm_tr[args->nrm_tr-1] = tr; } } @@ -3426,24 +2465,33 @@ void vbuf_flush(args_t *args, uint32_t pos) for (i=0; inrm_tr; i++) { - tscript_t *tr = args->rm_tr[i]; - if ( tr->root ) hap_destroy(tr->root); - tr->root = NULL; - free(tr->hap); - free(tr->ref); - free(tr->sref); + gf_tscript_t *tr = args->rm_tr[i]; + tscript_t *aux = TSCRIPT_AUX(tr); + if ( aux->root ) hap_destroy(aux->root); + aux->root = NULL; + free(aux->hap); + free(aux->ref); + free(aux->sref); + free(aux); + tr->aux = NULL; } args->nrm_tr = 0; args->ncsq_buf = 0; } -void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) +void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr) { int i, len; int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg; - tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); - if ( !tr->ref ) + const char *tmp_chr = chr; + if ( !faidx_has_seq(args->fai,tmp_chr) ) + { + tmp_chr = drop_chr_prefix(args,chr); + if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr); + } + TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len); + if ( !TSCRIPT_AUX(tr)->ref ) error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1); int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); @@ -3451,23 +2499,23 @@ void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr) { char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; - memcpy(ref+i, tr->ref, len); + memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len); len += i; for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; ref[i+len] = 0; - free(tr->ref); - tr->ref = ref; + 
free(TSCRIPT_AUX(tr)->ref); + TSCRIPT_AUX(tr)->ref = ref; } } -static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) +static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec) { int vbeg = 0; int rbeg = rec->pos - tr->beg + N_REF_PAD; if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } - char *ref = tr->ref + rbeg; + char *ref = TSCRIPT_AUX(tr)->ref + rbeg; char *vcf = rec->d.allele[0] + vbeg; - assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); + assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD ); int i = 0; while ( ref[i] && vcf[i] ) { @@ -3481,7 +2529,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) int test_cds_local(args_t *args, bcf1_t *rec) { int i,j, ret = 0; - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; @@ -3493,12 +2541,13 @@ int test_cds_local(args_t *args, bcf1_t *rec) while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; if ( !GF_is_coding(tr->type) ) continue; ret = 1; - if ( !tr->ref ) + if ( !TSCRIPT_AUX(tr) ) { + tr->aux = calloc(sizeof(tscript_t),1); tscript_init_ref(args, tr, chr); tscript_splice_ref(tr); khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards @@ -3507,8 +2556,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) sanity_check_ref(args, tr, rec); kstring_t sref; - sref.s = tr->sref; - sref.l = tr->nsref; + sref.s = TSCRIPT_AUX(tr)->sref; + sref.l = TSCRIPT_AUX(tr)->nsref; sref.m = sref.l; for (i=1; in_allele; i++) @@ -3616,8 +2665,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) { 
// create the aa variant string kstring_t str = {0,0,0}; - int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; - int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; + int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1; + int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; kputc_('|', &str); kputw(aa_rbeg, &str); kprint_aa_prediction(args,aa_rbeg,tref,&str); @@ -3635,11 +2684,11 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_stage(args, &csq, rec); // all this only to clean vstr when vrec is flushed - if ( !tr->root ) - tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); - tr->root->ncsq_list++; - hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); - csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; + if ( !TSCRIPT_AUX(tr)->root ) + TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + TSCRIPT_AUX(tr)->root->ncsq_list++; + hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list); + csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1; rm_csq->type.vstr = str; } if ( csq_type & ~CSQ_COMPOUND ) @@ -3661,27 +2710,28 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) static int overlaps_warned = 0, multiploid_warned = 0; int i, ret = 0, hap_ret; - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; while ( regitr_overlap(args->itr) ) { gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = 
cds->tr; + gf_tscript_t *tr = cds->tr; if ( !GF_is_coding(tr->type) ) continue; if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; ret = 1; - if ( !tr->root ) + if ( !TSCRIPT_AUX(tr) ) { // initialize the transcript and its haplotype tree, fetch the reference sequence + tr->aux = calloc(sizeof(tscript_t),1); tscript_init_ref(args, tr, chr); - tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); - tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid - tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*)); - for (i=0; inhap; i++) tr->hap[i] = NULL; - tr->root->nend = tr->nhap; - tr->root->type = HAP_ROOT; + TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t)); + TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid + TSCRIPT_AUX(tr)->hap = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*)); + for (i=0; inhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL; + TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap; + TSCRIPT_AUX(tr)->root->type = HAP_ROOT; khp_insert(trhp, args->active_tr, &tr); } @@ -3691,7 +2741,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) if ( args->phase==PHASE_DROP_GT ) { if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } - hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root; + hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? 
TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root; hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t)); hap_ret = hap_init(args, parent, child, cds, rec, 1); if ( hap_ret!=0 ) @@ -3736,8 +2786,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) parent->mchild = 1; parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*)); parent->child[0] = child; - tr->hap[0] = child; - tr->hap[0]->nend = 1; + TSCRIPT_AUX(tr)->hap[0] = child; + TSCRIPT_AUX(tr)->hap[0]->nend = 1; continue; } @@ -3795,12 +2845,12 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) assert( ial < rec->n_allele ); if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; } - hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root; + hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root; if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 ) { // this haplotype has been seen in another sample - tr->hap[i] = parent->child[ parent->cur_child[ial] ]; - tr->hap[i]->nend++; + TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ]; + TSCRIPT_AUX(tr)->hap[i]->nend++; parent->nend--; continue; } @@ -3854,8 +2904,8 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child); parent->cur_child[ial] = j; parent->child[j] = child; - tr->hap[i] = child; - tr->hap[i]->nend++; + TSCRIPT_AUX(tr)->hap[i] = child; + TSCRIPT_AUX(tr)->hap[i]->nend++; parent->nend--; } } @@ -3935,7 +2985,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) } int test_utr(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // note that the off-by-one extension of rlen is deliberate to account for insertions if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; @@ -3946,7 +2996,7 @@ int test_utr(args_t *args, bcf1_t *rec) while ( 
regitr_overlap(args->itr) ) { gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); - tscript_t *tr = splice.tr = utr->tr; + gf_tscript_t *tr = splice.tr = utr->tr; for (i=1; in_allele; i++) { if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } @@ -3973,7 +3023,7 @@ int test_utr(args_t *args, bcf1_t *rec) } int test_splice(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0; splice_t splice; @@ -4005,7 +3055,7 @@ int test_splice(args_t *args, bcf1_t *rec) } int test_tscript(args_t *args, bcf1_t *rec) { - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0; splice_t splice; @@ -4014,7 +3064,7 @@ int test_tscript(args_t *args, bcf1_t *rec) int i, ret = 0; while ( regitr_overlap(args->itr) ) { - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*); for (i=1; in_allele; i++) { if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } @@ -4048,7 +3098,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) warned = 1; } - const char *chr = bcf_seqname(args->hdr,rec); + const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec)); // only insertions atm int beg = rec->pos + 1; @@ -4063,7 +3113,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; + gf_tscript_t *tr = cds->tr; csq.type.type = (GF_is_coding(tr->type) ? 
CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -4081,7 +3131,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); - tscript_t *tr = utr->tr; + gf_tscript_t *tr = utr->tr; csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class; csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -4120,7 +3170,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { csq_t csq; memset(&csq, 0, sizeof(csq_t)); - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); + gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*); splice.vcf.alt = rec->d.allele[1]; splice.csq = csq_class; int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); @@ -4181,7 +3231,10 @@ static void process(args_t *args, bcf1_t **rec_ptr) // Perform a simple sanity check (that does not catch much), the chromosome must be present in the // reference file if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) - error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + { + if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) ) + error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + } } if ( prev_pos > rec->pos ) error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); @@ -4256,9 +3309,12 @@ static const char *usage(void) " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" " s: skip unphased hets\n" - "Options:\n" - " -e, --exclude EXPR Exclude sites for which the expression is true\n" + "GFF options:\n" + " --dump-gff 
FILE.gz Dump the parsed GFF file (for debugging purposes)\n" " --force Run even if some sanity checks fail\n" + " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n" + "General options:\n" + " -e, --exclude EXPR Exclude sites for which the expression is true\n" " -i, --include EXPR Select sites for which the expression is true\n" " --no-version Do not append version and command line to the header\n" " -o, --output FILE Write output to a file [standard output]\n" @@ -4274,6 +3330,7 @@ static const char *usage(void) " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --threads INT Use multithreading with worker threads [0]\n" " -v, --verbose INT Verbosity level 0-2 [1]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4294,6 +3351,7 @@ int main_csq(int argc, char *argv[]) args->verbosity = 1; args->record_cmd_line = 1; args->clevel = -1; + args->unify_chr_names = 1; static struct option loptions[] = { @@ -4323,6 +3381,9 @@ int main_csq(int argc, char *argv[]) {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,5}, {"no-version",no_argument,NULL,3}, + {"write-index",no_argument,NULL,6}, + {"dump-gff",required_argument,NULL,7}, + {"unify-chr-names",required_argument,NULL,8}, {0,0,0,0} }; int c, targets_is_file = 0, regions_is_file = 0; @@ -4341,7 +3402,7 @@ int main_csq(int argc, char *argv[]) case 3 : args->record_cmd_line = 0; break; case 'b': args->brief_predictions = 1; - fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n"); + fprintf(bcftools_stderr,"Warning: The -b option will be removed in future versions. 
Please use -B 1 instead.\n"); break; case 'B': args->brief_predictions = strtol(optarg,&tmp,10); @@ -4411,6 +3472,13 @@ int main_csq(int argc, char *argv[]) targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 6 : args->write_index = 1; break; + case 7 : args->dump_gff = optarg; break; + case 8 : + if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0; + else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1; + else error("Could not parse: --unify-chr-names %s\n",optarg); + break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; diff --git a/bcftools/filter.c b/bcftools/filter.c index 3925475..b6547f8 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -109,8 +109,8 @@ struct _filter_t #if ENABLE_PERL_FILTERS PerlInterpreter *perl; #endif - char **undef_tag; - int nundef_tag; + char **undef_tag, **used_tag; + int nundef_tag, nused_tag; int status, exit_on_error; }; @@ -328,6 +328,32 @@ const char **filter_list_undef_tags(filter_t *filter, int *ntags) *ntags = filter->nundef_tag; return (const char**)filter->undef_tag; } +static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str) +{ + int i; + kstring_t tmp = {0,0,0}; + if ( prefix ) kputs(prefix,&tmp); + kputs(str,&tmp); + for (i=0; inused_tag; i++) + if ( !strcmp(tmp.s,filter->used_tag[i]) ) break; + if ( inused_tag ) + { + free(tmp.s); + return; + } + + filter->nused_tag++; + filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag); + if ( !filter->used_tag ) error("Could not allocate memory\n"); + filter->used_tag[filter->nused_tag-1] = tmp.s; + if ( !filter->used_tag[filter->nused_tag-1] ) error("Could not allocate memory\n"); +} +const char **filter_list_used_tags(filter_t *filter, int *ntags) +{ + *ntags = filter->nused_tag; + return (const char**)filter->used_tag; +} + /* @@ -2841,6 
+2867,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = filters_set_qual; tok->tag = strdup("QUAL"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ ) @@ -2855,24 +2882,28 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->tag = strdup("FILTER"); filter->max_unpack |= BCF_UN_FLT; tok->tag_type = BCF_HL_FLT; + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) { tok->comparator = filters_cmp_id; tok->tag = strdup("ID"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"CHROM",len) ) { tok->setter = &filters_set_chrom; tok->tag = strdup("CHROM"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"POS",len) ) { tok->setter = &filters_set_pos; tok->tag = strdup("POS"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"REF",len) ) @@ -2880,6 +2911,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_ref_string; tok->is_str = 1; tok->tag = strdup("REF"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"ALT",len) ) @@ -2891,6 +2923,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->idxs[0] = -1; tok->nidxs = 1; tok->idx = -2; + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"N_ALT",len) ) @@ -3018,6 +3051,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } tok->tag = strdup(tmp.s); if ( tmp.s ) free(tmp.s); + filter_add_used_tag(filter,is_fmt ? 
"FORMAT/" : "INFO/",tok->tag); return 0; } else if ( !strcasecmp(tmp.s,"ALT") ) @@ -3026,6 +3060,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->is_str = 1; tok->tag = strdup(tmp.s); free(tmp.s); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strcasecmp(tmp.s,"AN") ) @@ -3669,7 +3704,9 @@ void filter_destroy(filter_t *filter) } } for (i=0; inundef_tag; i++) free(filter->undef_tag[i]); + for (i=0; inused_tag; i++) free(filter->used_tag[i]); free(filter->undef_tag); + free(filter->used_tag); free(filter->cached_GT.buf); free(filter->cached_GT.mask); free(filter->filters); diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 8e2d1d1..d0e2625 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -111,8 +111,8 @@ struct _filter_t #if ENABLE_PERL_FILTERS PerlInterpreter *perl; #endif - char **undef_tag; - int nundef_tag; + char **undef_tag, **used_tag; + int nundef_tag, nused_tag; int status, exit_on_error; }; @@ -330,6 +330,32 @@ const char **filter_list_undef_tags(filter_t *filter, int *ntags) *ntags = filter->nundef_tag; return (const char**)filter->undef_tag; } +static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str) +{ + int i; + kstring_t tmp = {0,0,0}; + if ( prefix ) kputs(prefix,&tmp); + kputs(str,&tmp); + for (i=0; inused_tag; i++) + if ( !strcmp(tmp.s,filter->used_tag[i]) ) break; + if ( inused_tag ) + { + free(tmp.s); + return; + } + + filter->nused_tag++; + filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag); + if ( !filter->used_tag ) error("Could not allocate memory\n"); + filter->used_tag[filter->nused_tag-1] = tmp.s; + if ( !filter->used_tag[filter->nused_tag-1] ) error("Could not allocate memory\n"); +} +const char **filter_list_used_tags(filter_t *filter, int *ntags) +{ + *ntags = filter->nused_tag; + return (const char**)filter->used_tag; +} + /* @@ -2843,6 +2869,7 @@ static int 
filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->setter = filters_set_qual; tok->tag = strdup("QUAL"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ ) @@ -2857,24 +2884,28 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->tag = strdup("FILTER"); filter->max_unpack |= BCF_UN_FLT; tok->tag_type = BCF_HL_FLT; + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) { tok->comparator = filters_cmp_id; tok->tag = strdup("ID"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"CHROM",len) ) { tok->setter = &filters_set_chrom; tok->tag = strdup("CHROM"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"POS",len) ) { tok->setter = &filters_set_pos; tok->tag = strdup("POS"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"REF",len) ) @@ -2882,6 +2913,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_ref_string; tok->is_str = 1; tok->tag = strdup("REF"); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"ALT",len) ) @@ -2893,6 +2925,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->idxs[0] = -1; tok->nidxs = 1; tok->idx = -2; + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strncasecmp(str,"N_ALT",len) ) @@ -3020,6 +3053,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } tok->tag = strdup(tmp.s); if ( tmp.s ) free(tmp.s); + filter_add_used_tag(filter,is_fmt ? 
"FORMAT/" : "INFO/",tok->tag); return 0; } else if ( !strcasecmp(tmp.s,"ALT") ) @@ -3028,6 +3062,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->is_str = 1; tok->tag = strdup(tmp.s); free(tmp.s); + filter_add_used_tag(filter,NULL,tok->tag); return 0; } else if ( !strcasecmp(tmp.s,"AN") ) @@ -3671,7 +3706,9 @@ void filter_destroy(filter_t *filter) } } for (i=0; inundef_tag; i++) free(filter->undef_tag[i]); + for (i=0; inused_tag; i++) free(filter->used_tag[i]); free(filter->undef_tag); + free(filter->used_tag); free(filter->cached_GT.buf); free(filter->cached_GT.mask); free(filter->filters); diff --git a/bcftools/filter.h b/bcftools/filter.h index 7be842a..cc60d6b 100644 --- a/bcftools/filter.h +++ b/bcftools/filter.h @@ -79,5 +79,6 @@ filter_t *filter_parse(bcf_hdr_t *hdr, const char *str); */ int filter_status(filter_t *filter); const char **filter_list_undef_tags(filter_t *filter, int *nundef); +const char **filter_list_used_tags(filter_t *filter, int *nused); #endif diff --git a/bcftools/gff.c b/bcftools/gff.c new file mode 100644 index 0000000..90da84b --- /dev/null +++ b/bcftools/gff.c @@ -0,0 +1,1098 @@ +/* The MIT License + + Copyright (c) 2023 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "gff.h" + +/* + Helper structures, only for initialization + + ftr_t + temporary list of all exons, CDS, UTRs +*/ +KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*) +KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) +typedef struct +{ + int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR + uint32_t beg; + uint32_t end; + uint32_t trid; + uint32_t strand:1; // STRAND_REV,STRAND_FWD + uint32_t phase:2; // 0, 1, 2, or 3 for unknown + uint32_t iseq:29; +} +ftr_t; + +/* + Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) + to integer id. To keep the memory requirements low, the original version + relied on IDs in the form of a string prefix and a numerical id. However, + it turns out that this assumption is not valid for some ensembl GFFs, see + for example Zea_mays.AGPv4.36.gff3.gz + */ +typedef struct +{ + void *str2id; // khash_str2int + int nstr, mstr; + char **str; // numeric id to string +} +id_tbl_t; + +typedef struct +{ + // all exons, CDS, UTRs + ftr_t *ftr; + int nftr, mftr; + + // mapping from gene id to gf_gene_t + kh_int2gene_t *gid2gene; + + // mapping from transcript id to tscript, for quick CDS anchoring + kh_int2tscript_t *id2tr; + + // sequences + void *seq2int; // str2int hash + char **seq; + int nseq, mseq; + + // ignored biotypes + void *ignored_biotypes; + + id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. 
Zm00001d027245) and a numeric idx + + // pointers to the current partially processed line + char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end, + *chr, *chr_end, *name, *name_end, *type, *type_end; +} +aux_t; + +struct gff_t_ +{ + const char *fname, *dump_fname; + + // the main regidx lookups, from chr:beg-end to overlapping features and + // index iterator + regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + + // temporary structures, deleted after initializtion + aux_t init; + + // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx + id_tbl_t tscript_ids; + + int strip_chr_names, verbosity; + int force; // force run under various conditions. Currently only to skip out-of-phase transcripts + + struct { + int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + } warned; +}; + +static const char *gf_strings_noncoding[] = +{ + "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", + "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", + "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", + "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf", + "lncRNA" +}; +static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", 
"IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; +static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; + +int gff_set(gff_t *gff, gff_opt_t key, ...) +{ + va_list args; + switch (key) + { + case dump_fname: + va_start(args, key); + gff->dump_fname = va_arg(args,char*); + va_end(args); + return 0; + + case force_out_of_phase: + va_start(args, key); + gff->force = va_arg(args,int); + va_end(args); + return 0; + + case strip_chr_names: + va_start(args, key); + gff->strip_chr_names = va_arg(args,int); + va_end(args); + return 0; + + case verbosity: + va_start(args, key); + gff->verbosity = va_arg(args,int); + va_end(args); + return 0; + + default: + error("The key %d is not supported with gff_set\n",key); + } + return 0; +} + +void *gff_get(gff_t *gff, gff_opt_t key) +{ + switch (key) + { + case idx_cds: return gff->idx_cds; + case idx_utr: return gff->idx_utr; + case idx_exon: return gff->idx_exon; + case idx_tscript: return gff->idx_tscript; + default: + error("The key %d is not supported with gff_get\n",key); + } + return NULL; +} + +const char *gff_id2string(gff_t *gff, id_type_t type, int id) // currently only transcript ids +{ + return gff->tscript_ids.str[id]; +} + +const char *gf_type2gff_string(int type) +{ + if ( !GF_is_coding(type) ) + { + if ( type < (1<init; + char tmp = chr_end[1]; + chr_end[1] = 0; + int iseq; + if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + { + char *new_chr = strdup(chr_beg); + hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = new_chr; + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + } + chr_end[1] = tmp; + return iseq; +} +static inline char *gff_skip(const char *line, char *ss) +{ + while ( *ss && *ss!='\t' ) ss++; + if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return ss+1; +} +static 
inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end) +{ + char *se = (char*) line; + while ( *se && *se!='\t' ) se++; + if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3; + *chr_beg = (char*) line; + *chr_end = se-1; +} +static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) +{ + char *se = ss; + *beg = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); + ss = se+1; + *end = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return se+1; +} +static void gff_id_init(id_tbl_t *tbl) +{ + memset(tbl, 0, sizeof(*tbl)); + tbl->str2id = khash_str2int_init(); +} +static void gff_id_destroy(id_tbl_t *tbl) +{ + khash_str2int_destroy_free(tbl->str2id); + free(tbl->str); +} +static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr) +{ + char tmp = end[1]; + end[1] = 0; + int id; + if ( khash_str2int_get(tbl->str2id, beg, &id) < 0 ) + { + id = tbl->nstr++; + hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); + tbl->str[id] = strdup(beg); + khash_str2int_set(tbl->str2id, tbl->str[id], id); + } + end[1] = tmp; + *id_ptr = id; + return 0; +} +static inline int gff_parse_biotype(char *line) +{ + if ( !line ) return -1; + switch (*line) + { + case 'p': + if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; + else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; + else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; + else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; + break; + case 'a': + if ( 
!strncmp(line,"artifact",8) ) return GF_ARTIFACT; + else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; + else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; + break; + case 'I': + if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; + else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; + else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; + else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; + else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C; + else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D; + else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J; + else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V; + else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV; + break; + case 'T': + if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; + else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; + else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C; + else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D; + else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J; + else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V; + break; + case 'M': + if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; + else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; + else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; + else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING; + break; + case 'l': + if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; + if ( !strncmp(line,"lncRNA",7) ) return GF_lncRNA; + break; + case 'm': + if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; + else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA; + else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; + else if ( !strncasecmp(line,"mRNA",4) ) 
return GF_PROTEIN_CODING; + break; + case 'r': + if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; + else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; + else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON; + else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; + break; + case 's': + if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; + else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; + else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; + else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; + else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; + else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; + else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; + break; + case 't': + if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; + break; + case 'n': + if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; + else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; + break; + case 'N': + if ( !strncmp(line,"NMD",3) ) return GF_NMD; + break; + case 'k': + if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; + break; + case 'u': + if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; + break; + case 'L': + if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; 
+ break; + case '3': + if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; + else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA; + break; + case 'd': + if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; + break; + case 'v': + if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; + break; + case 'b': + if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; + break; + } + return 0; +} +static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se) +{ + if ( !ss ) return 0; + + char tmp = se[1]; + se[1] = 0; + + char *key = ss; + int n = 0; + if ( khash_str2int_get(gff->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); + khash_str2int_set(gff->init.ignored_biotypes, key, n+1); + + se[1] = tmp; + return 1; +} +static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) +{ + khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); + gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); + if ( !gene ) + { + gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); + int ret; + k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); + kh_val(aux->gid2gene,k) = gene; + } + return gene; +} +static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + + ftr->type = gff_parse_biotype(aux->biotype); + if ( ftr->type <= 0 ) + { + char tmp = aux->type_end[1]; + aux->type_end[1] = 0; + ftr->type = gff_parse_biotype(aux->type); + aux->type_end[1] = tmp; + } + if ( ftr->type <= 0 ) + { + if ( !gff_ignored_biotype(gff,aux->biotype,aux->biotype_end) ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_tscript_biotype || gff->verbosity > 1 ) + fprintf(stderr,"Warning: Ignoring transcript with unknown biotype .. 
%s\n", line); + gff->warned.unknown_tscript_biotype++; + } + } + return; + } + + if ( !aux->id ) + error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + if ( !aux->parent ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + + uint32_t trid,gene_id; + gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid); + gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id); + + gf_tscript_t *tr = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t)); + tr->id = trid; + tr->strand = ftr->strand; + tr->gene = gene_init(aux, gene_id); + tr->type = ftr->type; + tr->beg = ftr->beg; + tr->end = ftr->end; + + khint_t k; + int ret; + k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); + kh_val(aux->id2tr,k) = tr; +} +// register exon, CDS, UTR +static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + if ( !aux->parent ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + + // associate with transcript id + gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid); + + if ( ftr->strand==-1 && gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_strand || gff->verbosity > 1 ) + fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line); + gff->warned.unknown_strand++; + } + if ( ftr->phase==-1 && gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_phase|| gff->verbosity > 1 ) + fprintf(stderr,"Warning: Ignoring GFF feature with unknown phase .. 
%s\n",line); + gff->warned.unknown_phase++; + } + ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end); +} +static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + if ( !aux->id ) return; + + uint32_t gene_id; + gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id); + + gf_gene_t *gene = gene_init(aux, gene_id); + if ( gene->name ) + { + if ( !gff->warned.duplicate_id || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line); + gff->warned.duplicate_id++; + return; + } + + gene->iseq = feature_set_seq(gff, aux->chr,aux->chr_end); + gene->beg = ftr->beg; + gene->end = ftr->end; + gene->strand = ftr->strand; + gene->id = gene_id; + + if ( aux->name ) + { + gene->name = (char*) malloc(aux->name_end - aux->name + 2); + memcpy(gene->name,aux->name,aux->name_end - aux->name + 1); + gene->name[aux->name_end - aux->name + 1] = 0; + } + else + gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead +} + +// Returns 0 for exons,CDS,UTRs to indiciate these need to be pruned later and regidx built on them, +// or -1 to indiciate the structure needs not be saved (either because of an error or because saved +// as transcript or gene.) +static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr) +{ + // - skip empty lines and commented lines + // - columns + // 1. chr + // 2. + // 3. CDS, transcript, gene, ... + // 4-5. beg,end + // 6. + // 7. strand + // 8. phase + // 9. Parent=transcript:ENST(\d+);ID=...;biotype=... etc + + char *ss = line; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + aux_t *aux = &gff->init; + gff_parse_chr(gff, line, &aux->chr, &aux->chr_end); + ss = gff_skip(line, aux->chr_end + 2); + + // 3rd column: is this a CDS, transcript, gene, etc.. 
The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3 + int is_gene_line = 0; + ftr->type = 0; + aux->type = ss; + if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } + else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } + else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } + else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } + else if ( !strncmp("biological_region\t",ss,18) ) { return -1; } // skip + else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; } + else ss = gff_skip(line, ss); + aux->type_end = ss - 1; + + // 4-5th columns: beg,end + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + + // 6th column: skip + ss = gff_skip(line, ss); + + // 7th column: strand + ftr->strand = -1; + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + ss += 2; + + // 8th column: phase (codon offset) + ftr->phase = -1; + if ( *ss == '0' ) ftr->phase = 0; + else if ( *ss == '1' ) ftr->phase = 1; + else if ( *ss == '2' ) ftr->phase = 2; + else if ( *ss == '.' 
) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase + ss += 2; + + // 9th column: id, parent, name, biotype + aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL; + while ( *ss ) + { + char *es = ss; + while ( *es && *es!=';' ) es++; + if ( !strncmp(ss,"ID=",3) ) + { + ss += 3; + aux->id_end = es - 1; + aux->id = ss; + if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; } + else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11; + } + else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; } + else if ( !strncmp(ss,"Parent=",7) ) + { + ss += 7; + aux->parent_end = es - 1; + aux->parent = ss; + if ( !strncmp(ss,"gene:",5) ) aux->parent += 5; + else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11; + } + else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; } + else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; } + if ( !*es ) break; + ss = es + 1; + } + + if ( is_gene_line || !aux->parent ) + { + gff_parse_gene(gff, line, ftr); + return -1; + } + + if ( ftr->type ) + { + gff_parse_exon(gff, line, ftr); + return 0; + } + + gff_parse_transcript(gff, line, ftr); + return -1; +} + +static int cmp_cds_ptr(const void *a, const void *b) +{ + // comparison function for qsort of transcripts's CDS + if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; + if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1; + return 0; +} + +static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) +{ + *chr_beg = *chr_end = aux->seq[iseq]; + while ( (*chr_end)[1] ) (*chr_end)++; +} +static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid) +{ + khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); + gf_tscript_t *tr = (k == kh_end(aux->id2tr)) ? 
NULL : kh_val(aux->id2tr, k); + assert( tr ); + return tr; +} +static void register_cds(gff_t *gff, ftr_t *ftr) +{ + // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. + // ftr is the result of parsing a gff CDS line + aux_t *aux = &gff->init; + + gf_tscript_t *tr = tscript_init(aux, ftr->trid); + if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); + + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); + cds->tr = tr; + cds->beg = ftr->beg; + cds->len = ftr->end - ftr->beg + 1; + cds->icds = 0; // to keep valgrind on mac happy + cds->phase = ftr->phase; + + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); + tr->cds[tr->ncds++] = cds; +} +static void register_utr(gff_t *gff, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); + utr->which = ftr->type==GF_UTR3 ? prime3 : prime5; + utr->beg = ftr->beg; + utr->end = ftr->end; + utr->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); +} +static void register_exon(gff_t *gff, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); + exon->beg = ftr->beg; + exon->end = ftr->end; + exon->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); +} + +static void tscript_init_cds(gff_t *gff) +{ + aux_t *aux = &gff->init; + + // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) + khint_t k; + for (k=0; kid2tr); k++) + { + if ( !kh_exist(aux->id2tr, k) ) continue; + gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k); + + // 
position-to-tscript lookup + char *chr_beg, *chr_end; + chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); + + if ( !tr->ncds ) continue; // transcript with no CDS + + // sort CDs + qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); + + // trim non-coding start + int i, len = 0; + if ( tr->strand==STRAND_FWD ) + { + if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + } + + // sanity check phase; the phase number in gff tells us how many bases to skip in this + // feature to reach the first base of the next codon + int tscript_ok = 1; + for (i=0; incds; i++) + { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]); + gff->warned.unknown_cds_phase++; + } + len += tr->cds[i]->len; + continue; + } + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3 ) + { + if ( !gff->force ) + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.wrong_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. 
CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + gff->warned.wrong_phase++; + } + tscript_ok = 0; + break; + } + len += tr->cds[i]->len; + } + if ( !tscript_ok ) continue; // skip this transcript + } + else + { + if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) + { + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141. + // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + if ( gff->verbosity > 0 && tr->cds[i]->phase ) + { + if ( !gff->warned.incomplete_cds || gff->verbosity > 1 ) + fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]); + gff->warned.incomplete_cds++; + } + tr->cds[i]->len -= tr->cds[i]->phase; + tr->cds[i]->phase = 0; + } + + // sanity check phase + int tscript_ok = 1; + for (i=tr->ncds-1; i>=0; i--) + { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]); + gff->warned.unknown_cds_phase++; + } + len += tr->cds[i]->len; + continue; + } + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3 ) + { + if ( !gff->force ) + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.wrong_phase || gff->verbosity > 1 ) + fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + gff->warned.wrong_phase++; + } + tscript_ok = 0; + break; + } + len += tr->cds[i]->len; + } + if ( !tscript_ok ) continue; // skip this transcript + } + + // set len. At the same check that CDS within a transcript do not overlap + len = 0; + for (i=0; incds; i++) + { + tr->cds[i]->icds = i; + len += tr->cds[i]->len; + if ( !i ) continue; + + gf_cds_t *a = tr->cds[i-1]; + gf_cds_t *b = tr->cds[i]; + if ( a->beg + a->len - 1 >= b->beg ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.overlapping_cds || gff->verbosity > 1 ) + fprintf(stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n", + gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + gff->warned.overlapping_cds++; + } + } + } + + if ( len%3 != 0 ) + { + // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 + // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 + // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. + + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.incomplete_cds || gff->verbosity > 1 ) + fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]); + gff->warned.incomplete_cds++; + } + + tr->trim |= TRIM_3PRIME; + if ( tr->strand==STRAND_FWD ) + { + i = tr->ncds - 1; + while ( i>=0 && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? 
len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + len -= dlen; + i--; + } + } + else + { + i = 0; + while ( incds && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + tr->cds[i]->beg += dlen; + len -= dlen; + i++; + } + } + } + + // set CDS offsets and insert into regidx + len=0; + for (i=0; incds; i++) + { + tr->cds[i]->pos = len; + len += tr->cds[i]->len; + regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); + } + } +} + +static void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } +static void regidx_free_tscript(void *payload) { gf_tscript_t *tr = *((gf_tscript_t**)payload); free(tr->cds); free(tr); } + +static int gff_dump(gff_t *gff, const char *fname) +{ + BGZF *out = bgzf_open(fname,"wg"); + if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno)); + + kstring_t str = {0,0,0}; + + khint_t k; + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(gff->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); + char *gene_id = gff->init.gene_ids.str[gene->id]; + str.l = 0; + ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + + regitr_t *itr = regitr_init(gff->idx_tscript); + while ( regitr_loop(itr) ) + { + gf_tscript_t *tr = regitr_payload(itr, gf_tscript_t*); + char *gene_id = gff->init.gene_ids.str[tr->gene->id]; + const char *type = tr->type==GF_PROTEIN_CODING ? 
"mRNA" : gf_type2gff_string(tr->type); + str.l = 0; + ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_cds); + while ( regitr_loop(itr) ) + { + gf_cds_t *cds = regitr_payload(itr,gf_cds_t*); + gf_tscript_t *tr = cds->tr; + str.l = 0; + ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_utr); + while ( regitr_loop(itr) ) + { + gf_utr_t *utr = regitr_payload(itr,gf_utr_t*); + gf_tscript_t *tr = utr->tr; + str.l = 0; + ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_exon); + while ( regitr_loop(itr) ) + { + gf_exon_t *exon = regitr_payload(itr,gf_exon_t*); + gf_tscript_t *tr = exon->tr; + str.l = 0; + ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + if ( bgzf_close(out)!=0 ) error("Error: close failed .. 
%s\n", fname); + free(str.s); + + return 0; +} + +int gff_parse(gff_t *gff) +{ + if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname); + + aux_t *aux = &gff->init; + aux->seq2int = khash_str2int_init(); // chrom's numeric id + aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene + aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t + gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL); + aux->ignored_biotypes = khash_str2int_init(); + gff_id_init(&aux->gene_ids); + gff_id_init(&gff->tscript_ids); + + // parse gff + kstring_t str = {0,0,0}; + htsFile *fp = hts_open(gff->fname,"r"); + if ( !fp ) error("Failed to read %s\n", gff->fname); + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); + int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr); + if ( !ret ) aux->nftr++; + } + free(str.s); + if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname); + + + // process gff information: connect CDS and exons to transcripts + gff->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); + gff->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); + gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); + + int i; + for (i=0; inftr; i++) + { + ftr_t *ftr = &aux->ftr[i]; + + // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? + khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); + if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype + + gf_tscript_t *tr = kh_val(aux->id2tr,k); + tr->used = 1; + tr->gene->used = 1; + + // populate regidx by category: + // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 + // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... 
+ if ( ftr->type==GF_CDS ) register_cds(gff, ftr); + else if ( ftr->type==GF_EXON ) register_exon(gff, ftr); + else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr); + else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr); + else + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); + } + tscript_init_cds(gff); + + if ( gff->verbosity > 0 ) + { + fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + regidx_nregs(gff->idx_tscript), + regidx_nregs(gff->idx_exon), + regidx_nregs(gff->idx_cds), + regidx_nregs(gff->idx_utr)); + } + + if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) + { + khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; + fprintf(stderr,"Ignored the following biotypes:\n"); + for (i = kh_begin(ign); i < kh_end(ign); i++) + { + if ( !kh_exist(ign,i)) continue; + const char *biotype = kh_key(ign,i); + if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; + fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype); + } + } + khash_str2int_destroy_free(aux->ignored_biotypes); + + // warned about unprinted warnings + if ( gff->verbosity > 0 ) + { + int nwarn = 0; + #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1; + INC_NWARN(unknown_chr); + INC_NWARN(unknown_tscript_biotype); + INC_NWARN(unknown_strand); + INC_NWARN(unknown_phase); + INC_NWARN(duplicate_id); + INC_NWARN(unknown_cds_phase); + INC_NWARN(incomplete_cds); + INC_NWARN(wrong_phase); + INC_NWARN(overlapping_cds); + if ( nwarn > 0 ) + fprintf(stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn); + } + + if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname); + + if ( !regidx_nregs(gff->idx_tscript) ) + error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. 
Please check if the misc/gff2gff\n" + " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" + " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); + + free(aux->seq); + free(aux->ftr); + khash_str2int_destroy_free(aux->seq2int); + // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); + kh_destroy(int2tscript,aux->id2tr); + gff_id_destroy(&aux->gene_ids); + + return 0; +} + +gff_t *gff_init(const char *fname) +{ + gff_t *gff = calloc(sizeof(gff_t),1); + gff->fname = fname; + return gff; +} +void gff_destroy(gff_t *gff) +{ + khint_t k; + if ( gff->init.gid2gene ) + { + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(gff->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); + free(gene->name); + free(gene); + } + kh_destroy(int2gene,gff->init.gid2gene); + } + + regidx_destroy(gff->idx_cds); + regidx_destroy(gff->idx_utr); + regidx_destroy(gff->idx_exon); + regidx_destroy(gff->idx_tscript); + + gff_id_destroy(&gff->tscript_ids); + free(gff); +} + diff --git a/bcftools/gff.c.pysam.c b/bcftools/gff.c.pysam.c new file mode 100644 index 0000000..f5c817d --- /dev/null +++ b/bcftools/gff.c.pysam.c @@ -0,0 +1,1100 @@ +#include "bcftools.pysam.h" + +/* The MIT License + + Copyright (c) 2023 Genome Research Ltd. 
+ + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "gff.h" + +/* + Helper structures, only for initialization + + ftr_t + temporary list of all exons, CDS, UTRs +*/ +KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*) +KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) +typedef struct +{ + int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR + uint32_t beg; + uint32_t end; + uint32_t trid; + uint32_t strand:1; // STRAND_REV,STRAND_FWD + uint32_t phase:2; // 0, 1, 2, or 3 for unknown + uint32_t iseq:29; +} +ftr_t; + +/* + Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) + to integer id. To keep the memory requirements low, the original version + relied on IDs in the form of a string prefix and a numerical id. 
However, + it turns out that this assumption is not valid for some ensembl GFFs, see + for example Zea_mays.AGPv4.36.gff3.gz + */ +typedef struct +{ + void *str2id; // khash_str2int + int nstr, mstr; + char **str; // numeric id to string +} +id_tbl_t; + +typedef struct +{ + // all exons, CDS, UTRs + ftr_t *ftr; + int nftr, mftr; + + // mapping from gene id to gf_gene_t + kh_int2gene_t *gid2gene; + + // mapping from transcript id to tscript, for quick CDS anchoring + kh_int2tscript_t *id2tr; + + // sequences + void *seq2int; // str2int hash + char **seq; + int nseq, mseq; + + // ignored biotypes + void *ignored_biotypes; + + id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx + + // pointers to the current partially processed line + char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end, + *chr, *chr_end, *name, *name_end, *type, *type_end; +} +aux_t; + +struct gff_t_ +{ + const char *fname, *dump_fname; + + // the main regidx lookups, from chr:beg-end to overlapping features and + // index iterator + regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; + + // temporary structures, deleted after initializtion + aux_t init; + + // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx + id_tbl_t tscript_ids; + + int strip_chr_names, verbosity; + int force; // force run under various conditions. 
Currently only to skip out-of-phase transcripts + + struct { + int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; + int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; + } warned; +}; + +static const char *gf_strings_noncoding[] = +{ + "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", + "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", + "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", + "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf", + "lncRNA" +}; +static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; +static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; + +int gff_set(gff_t *gff, gff_opt_t key, ...) 
+{ + va_list args; + switch (key) + { + case dump_fname: + va_start(args, key); + gff->dump_fname = va_arg(args,char*); + va_end(args); + return 0; + + case force_out_of_phase: + va_start(args, key); + gff->force = va_arg(args,int); + va_end(args); + return 0; + + case strip_chr_names: + va_start(args, key); + gff->strip_chr_names = va_arg(args,int); + va_end(args); + return 0; + + case verbosity: + va_start(args, key); + gff->verbosity = va_arg(args,int); + va_end(args); + return 0; + + default: + error("The key %d is not supported with gff_set\n",key); + } + return 0; +} + +void *gff_get(gff_t *gff, gff_opt_t key) +{ + switch (key) + { + case idx_cds: return gff->idx_cds; + case idx_utr: return gff->idx_utr; + case idx_exon: return gff->idx_exon; + case idx_tscript: return gff->idx_tscript; + default: + error("The key %d is not supported with gff_get\n",key); + } + return NULL; +} + +const char *gff_id2string(gff_t *gff, id_type_t type, int id) // currently only transcript ids +{ + return gff->tscript_ids.str[id]; +} + +const char *gf_type2gff_string(int type) +{ + if ( !GF_is_coding(type) ) + { + if ( type < (1<init; + char tmp = chr_end[1]; + chr_end[1] = 0; + int iseq; + if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) + { + char *new_chr = strdup(chr_beg); + hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = new_chr; + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + } + chr_end[1] = tmp; + return iseq; +} +static inline char *gff_skip(const char *line, char *ss) +{ + while ( *ss && *ss!='\t' ) ss++; + if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return ss+1; +} +static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end) +{ + char *se = (char*) line; + while ( *se && *se!='\t' ) se++; + if ( !*se ) error("[%s:%d %s] Could not parse the 
line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3; + *chr_beg = (char*) line; + *chr_end = se-1; +} +static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end) +{ + char *se = ss; + *beg = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss); + ss = se+1; + *end = strtol(ss, &se, 10) - 1; + if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + return se+1; +} +static void gff_id_init(id_tbl_t *tbl) +{ + memset(tbl, 0, sizeof(*tbl)); + tbl->str2id = khash_str2int_init(); +} +static void gff_id_destroy(id_tbl_t *tbl) +{ + khash_str2int_destroy_free(tbl->str2id); + free(tbl->str); +} +static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr) +{ + char tmp = end[1]; + end[1] = 0; + int id; + if ( khash_str2int_get(tbl->str2id, beg, &id) < 0 ) + { + id = tbl->nstr++; + hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); + tbl->str[id] = strdup(beg); + khash_str2int_set(tbl->str2id, tbl->str[id], id); + } + end[1] = tmp; + *id_ptr = id; + return 0; +} +static inline int gff_parse_biotype(char *line) +{ + if ( !line ) return -1; + switch (*line) + { + case 'p': + if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; + else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; + else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; + else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE; + break; + case 'a': + if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT; + else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE; + else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF; + break; + case 'I': + if ( 
!strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE; + else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE; + else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE; + else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE; + else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C; + else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D; + else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J; + else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V; + else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV; + break; + case 'T': + if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE; + else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE; + else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C; + else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D; + else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J; + else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V; + break; + case 'M': + if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE; + else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA; + else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA; + else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING; + break; + case 'l': + if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA; + if ( !strncmp(line,"lncRNA",7) ) return GF_lncRNA; + break; + case 'm': + if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA; + else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE; + else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA; + else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA; + else if ( !strncasecmp(line,"mRNA",4) ) return GF_PROTEIN_CODING; + break; + case 'r': + if ( !strncmp(line,"rRNA",4) ) return GF_rRNA; + else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME; + else if ( !strncmp(line,"retained_intron",15) ) return 
GF_RETAINED_INTRON; + else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED; + break; + case 's': + if ( !strncmp(line,"snRNA",5) ) return GF_snRNA; + else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA; + else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA; + else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA; + else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA; + else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC; + else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING; + break; + case 't': + if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; + break; + case 'n': + if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD; + else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY; + break; + case 'N': + if ( !strncmp(line,"NMD",3) ) return GF_NMD; + break; + case 'k': + if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA; + break; + case 'u': + if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE; + else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE; + break; + case 'L': + if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE; + break; + case '3': + if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA; + else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA; 
+ break; + case 'd': + if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN; + break; + case 'v': + if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA; + break; + case 'b': + if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA; + break; + } + return 0; +} +static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se) +{ + if ( !ss ) return 0; + + char tmp = se[1]; + se[1] = 0; + + char *key = ss; + int n = 0; + if ( khash_str2int_get(gff->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss); + khash_str2int_set(gff->init.ignored_biotypes, key, n+1); + + se[1] = tmp; + return 1; +} +static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id) +{ + khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id); + gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k); + if ( !gene ) + { + gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t)); + int ret; + k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret); + kh_val(aux->gid2gene,k) = gene; + } + return gene; +} +static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + + ftr->type = gff_parse_biotype(aux->biotype); + if ( ftr->type <= 0 ) + { + char tmp = aux->type_end[1]; + aux->type_end[1] = 0; + ftr->type = gff_parse_biotype(aux->type); + aux->type_end[1] = tmp; + } + if ( ftr->type <= 0 ) + { + if ( !gff_ignored_biotype(gff,aux->biotype,aux->biotype_end) ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_tscript_biotype || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: Ignoring transcript with unknown biotype .. 
%s\n", line); + gff->warned.unknown_tscript_biotype++; + } + } + return; + } + + if ( !aux->id ) + error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + if ( !aux->parent ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + + uint32_t trid,gene_id; + gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid); + gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id); + + gf_tscript_t *tr = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t)); + tr->id = trid; + tr->strand = ftr->strand; + tr->gene = gene_init(aux, gene_id); + tr->type = ftr->type; + tr->beg = ftr->beg; + tr->end = ftr->end; + + khint_t k; + int ret; + k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret); + kh_val(aux->id2tr,k) = tr; +} +// register exon, CDS, UTR +static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + if ( !aux->parent ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + + // associate with transcript id + gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid); + + if ( ftr->strand==-1 && gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_strand || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line); + gff->warned.unknown_strand++; + } + if ( ftr->phase==-1 && gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_phase|| gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown phase .. 
%s\n",line); + gff->warned.unknown_phase++; + } + ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end); +} +static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + if ( !aux->id ) return; + + uint32_t gene_id; + gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id); + + gf_gene_t *gene = gene_init(aux, gene_id); + if ( gene->name ) + { + if ( !gff->warned.duplicate_id || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line); + gff->warned.duplicate_id++; + return; + } + + gene->iseq = feature_set_seq(gff, aux->chr,aux->chr_end); + gene->beg = ftr->beg; + gene->end = ftr->end; + gene->strand = ftr->strand; + gene->id = gene_id; + + if ( aux->name ) + { + gene->name = (char*) malloc(aux->name_end - aux->name + 2); + memcpy(gene->name,aux->name,aux->name_end - aux->name + 1); + gene->name[aux->name_end - aux->name + 1] = 0; + } + else + gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead +} + +// Returns 0 for exons,CDS,UTRs to indiciate these need to be pruned later and regidx built on them, +// or -1 to indiciate the structure needs not be saved (either because of an error or because saved +// as transcript or gene.) +static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr) +{ + // - skip empty lines and commented lines + // - columns + // 1. chr + // 2. + // 3. CDS, transcript, gene, ... + // 4-5. beg,end + // 6. + // 7. strand + // 8. phase + // 9. Parent=transcript:ENST(\d+);ID=...;biotype=... etc + + char *ss = line; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + aux_t *aux = &gff->init; + gff_parse_chr(gff, line, &aux->chr, &aux->chr_end); + ss = gff_skip(line, aux->chr_end + 2); + + // 3rd column: is this a CDS, transcript, gene, etc.. 
The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3 + int is_gene_line = 0; + ftr->type = 0; + aux->type = ss; + if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; } + else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; } + else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; } + else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } + else if ( !strncmp("biological_region\t",ss,18) ) { return -1; } // skip + else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; } + else ss = gff_skip(line, ss); + aux->type_end = ss - 1; + + // 4-5th columns: beg,end + ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); + + // 6th column: skip + ss = gff_skip(line, ss); + + // 7th column: strand + ftr->strand = -1; + if ( *ss == '+' ) ftr->strand = STRAND_FWD; + else if ( *ss == '-' ) ftr->strand = STRAND_REV; + ss += 2; + + // 8th column: phase (codon offset) + ftr->phase = -1; + if ( *ss == '0' ) ftr->phase = 0; + else if ( *ss == '1' ) ftr->phase = 1; + else if ( *ss == '2' ) ftr->phase = 2; + else if ( *ss == '.' 
) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase + ss += 2; + + // 9th column: id, parent, name, biotype + aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL; + while ( *ss ) + { + char *es = ss; + while ( *es && *es!=';' ) es++; + if ( !strncmp(ss,"ID=",3) ) + { + ss += 3; + aux->id_end = es - 1; + aux->id = ss; + if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; } + else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11; + } + else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; } + else if ( !strncmp(ss,"Parent=",7) ) + { + ss += 7; + aux->parent_end = es - 1; + aux->parent = ss; + if ( !strncmp(ss,"gene:",5) ) aux->parent += 5; + else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11; + } + else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; } + else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; } + if ( !*es ) break; + ss = es + 1; + } + + if ( is_gene_line || !aux->parent ) + { + gff_parse_gene(gff, line, ftr); + return -1; + } + + if ( ftr->type ) + { + gff_parse_exon(gff, line, ftr); + return 0; + } + + gff_parse_transcript(gff, line, ftr); + return -1; +} + +static int cmp_cds_ptr(const void *a, const void *b) +{ + // comparison function for qsort of transcripts's CDS + if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1; + if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1; + return 0; +} + +static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end) +{ + *chr_beg = *chr_end = aux->seq[iseq]; + while ( (*chr_end)[1] ) (*chr_end)++; +} +static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid) +{ + khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid); + gf_tscript_t *tr = (k == kh_end(aux->id2tr)) ? 
NULL : kh_val(aux->id2tr, k); + assert( tr ); + return tr; +} +static void register_cds(gff_t *gff, ftr_t *ftr) +{ + // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet. + // ftr is the result of parsing a gff CDS line + aux_t *aux = &gff->init; + + gf_tscript_t *tr = tscript_init(aux, ftr->trid); + if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand); + + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); + cds->tr = tr; + cds->beg = ftr->beg; + cds->len = ftr->end - ftr->beg + 1; + cds->icds = 0; // to keep valgrind on mac happy + cds->phase = ftr->phase; + + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); + tr->cds[tr->ncds++] = cds; +} +static void register_utr(gff_t *gff, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t)); + utr->which = ftr->type==GF_UTR3 ? prime3 : prime5; + utr->beg = ftr->beg; + utr->end = ftr->end; + utr->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&gff->init, utr->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr); +} +static void register_exon(gff_t *gff, ftr_t *ftr) +{ + aux_t *aux = &gff->init; + gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t)); + exon->beg = ftr->beg; + exon->end = ftr->end; + exon->tr = tscript_init(aux, ftr->trid); + + char *chr_beg, *chr_end; + chr_beg_end(&gff->init, exon->tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon); +} + +static void tscript_init_cds(gff_t *gff) +{ + aux_t *aux = &gff->init; + + // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) + khint_t k; + for (k=0; kid2tr); k++) + { + if ( !kh_exist(aux->id2tr, k) ) continue; + gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k); + + // 
position-to-tscript lookup + char *chr_beg, *chr_end; + chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end); + regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr); + + if ( !tr->ncds ) continue; // transcript with no CDS + + // sort CDs + qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr); + + // trim non-coding start + int i, len = 0; + if ( tr->strand==STRAND_FWD ) + { + if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + } + + // sanity check phase; the phase number in gff tells us how many bases to skip in this + // feature to reach the first base of the next codon + int tscript_ok = 1; + for (i=0; incds; i++) + { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]); + gff->warned.unknown_cds_phase++; + } + len += tr->cds[i]->len; + continue; + } + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3 ) + { + if ( !gff->force ) + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.wrong_phase || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. 
CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + gff->warned.wrong_phase++; + } + tscript_ok = 0; + break; + } + len += tr->cds[i]->len; + } + if ( !tscript_ok ) continue; // skip this transcript + } + else + { + if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) + { + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141. + // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + if ( gff->verbosity > 0 && tr->cds[i]->phase ) + { + if ( !gff->warned.incomplete_cds || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]); + gff->warned.incomplete_cds++; + } + tr->cds[i]->len -= tr->cds[i]->phase; + tr->cds[i]->phase = 0; + } + + // sanity check phase + int tscript_ok = 1; + for (i=tr->ncds-1; i>=0; i--) + { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]); + gff->warned.unknown_cds_phase++; + } + len += tr->cds[i]->len; + continue; + } + int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; + if ( phase!=len%3 ) + { + if ( !gff->force ) + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.wrong_phase || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + gff->warned.wrong_phase++; + } + tscript_ok = 0; + break; + } + len += tr->cds[i]->len; + } + if ( !tscript_ok ) continue; // skip this transcript + } + + // set len. At the same check that CDS within a transcript do not overlap + len = 0; + for (i=0; incds; i++) + { + tr->cds[i]->icds = i; + len += tr->cds[i]->len; + if ( !i ) continue; + + gf_cds_t *a = tr->cds[i-1]; + gf_cds_t *b = tr->cds[i]; + if ( a->beg + a->len - 1 >= b->beg ) + { + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.overlapping_cds || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n", + gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + gff->warned.overlapping_cds++; + } + } + } + + if ( len%3 != 0 ) + { + // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289 + // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289 + // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one. 
+ + if ( gff->verbosity > 0 ) + { + if ( !gff->warned.incomplete_cds || gff->verbosity > 1 ) + fprintf(bcftools_stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]); + gff->warned.incomplete_cds++; + } + + tr->trim |= TRIM_3PRIME; + if ( tr->strand==STRAND_FWD ) + { + i = tr->ncds - 1; + while ( i>=0 && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + len -= dlen; + i--; + } + } + else + { + i = 0; + while ( incds && len%3 ) + { + int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; + tr->cds[i]->len -= dlen; + tr->cds[i]->beg += dlen; + len -= dlen; + i++; + } + } + } + + // set CDS offsets and insert into regidx + len=0; + for (i=0; incds; i++) + { + tr->cds[i]->pos = len; + len += tr->cds[i]->len; + regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); + } + } +} + +static void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } +static void regidx_free_tscript(void *payload) { gf_tscript_t *tr = *((gf_tscript_t**)payload); free(tr->cds); free(tr); } + +static int gff_dump(gff_t *gff, const char *fname) +{ + BGZF *out = bgzf_open(fname,"wg"); + if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno)); + + kstring_t str = {0,0,0}; + + khint_t k; + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(gff->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); + char *gene_id = gff->init.gene_ids.str[gene->id]; + str.l = 0; + ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + + regitr_t *itr = regitr_init(gff->idx_tscript); + while ( regitr_loop(itr) ) + { + gf_tscript_t *tr = 
regitr_payload(itr, gf_tscript_t*); + char *gene_id = gff->init.gene_ids.str[tr->gene->id]; + const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type); + str.l = 0; + ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_cds); + while ( regitr_loop(itr) ) + { + gf_cds_t *cds = regitr_payload(itr,gf_cds_t*); + gf_tscript_t *tr = cds->tr; + str.l = 0; + ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_utr); + while ( regitr_loop(itr) ) + { + gf_utr_t *utr = regitr_payload(itr,gf_utr_t*); + gf_tscript_t *tr = utr->tr; + str.l = 0; + ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + itr = regitr_init(gff->idx_exon); + while ( regitr_loop(itr) ) + { + gf_exon_t *exon = regitr_payload(itr,gf_exon_t*); + gf_tscript_t *tr = exon->tr; + str.l = 0; + ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); + if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno)); + } + regitr_destroy(itr); + + if 
( bgzf_close(out)!=0 ) error("Error: close failed .. %s\n", fname); + free(str.s); + + return 0; +} + +int gff_parse(gff_t *gff) +{ + if ( gff->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", gff->fname); + + aux_t *aux = &gff->init; + aux->seq2int = khash_str2int_init(); // chrom's numeric id + aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene + aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t + gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL); + aux->ignored_biotypes = khash_str2int_init(); + gff_id_init(&aux->gene_ids); + gff_id_init(&gff->tscript_ids); + + // parse gff + kstring_t str = {0,0,0}; + htsFile *fp = hts_open(gff->fname,"r"); + if ( !fp ) error("Failed to read %s\n", gff->fname); + while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) + { + hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr); + int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr); + if ( !ret ) aux->nftr++; + } + free(str.s); + if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname); + + + // process gff information: connect CDS and exons to transcripts + gff->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL); + gff->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL); + gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL); + + int i; + for (i=0; inftr; i++) + { + ftr_t *ftr = &aux->ftr[i]; + + // check whether to keep this feature: is there a mapping trid -> gene_id -> gene? + khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid); + if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype + + gf_tscript_t *tr = kh_val(aux->id2tr,k); + tr->used = 1; + tr->gene->used = 1; + + // populate regidx by category: + // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 + // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... 
+ if ( ftr->type==GF_CDS ) register_cds(gff, ftr); + else if ( ftr->type==GF_EXON ) register_exon(gff, ftr); + else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr); + else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr); + else + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); + } + tscript_init_cds(gff); + + if ( gff->verbosity > 0 ) + { + fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + regidx_nregs(gff->idx_tscript), + regidx_nregs(gff->idx_exon), + regidx_nregs(gff->idx_cds), + regidx_nregs(gff->idx_utr)); + } + + if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) + { + khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; + fprintf(bcftools_stderr,"Ignored the following biotypes:\n"); + for (i = kh_begin(ign); i < kh_end(ign); i++) + { + if ( !kh_exist(ign,i)) continue; + const char *biotype = kh_key(ign,i); + if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; + fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype); + } + } + khash_str2int_destroy_free(aux->ignored_biotypes); + + // warned about unprinted warnings + if ( gff->verbosity > 0 ) + { + int nwarn = 0; + #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1; + INC_NWARN(unknown_chr); + INC_NWARN(unknown_tscript_biotype); + INC_NWARN(unknown_strand); + INC_NWARN(unknown_phase); + INC_NWARN(duplicate_id); + INC_NWARN(unknown_cds_phase); + INC_NWARN(incomplete_cds); + INC_NWARN(wrong_phase); + INC_NWARN(overlapping_cds); + if ( nwarn > 0 ) + fprintf(bcftools_stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn); + } + + if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname); + + if ( !regidx_nregs(gff->idx_tscript) ) + error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. 
Please check if the misc/gff2gff\n" + " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" + " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); + + free(aux->seq); + free(aux->ftr); + khash_str2int_destroy_free(aux->seq2int); + // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); + kh_destroy(int2tscript,aux->id2tr); + gff_id_destroy(&aux->gene_ids); + + return 0; +} + +gff_t *gff_init(const char *fname) +{ + gff_t *gff = calloc(sizeof(gff_t),1); + gff->fname = fname; + return gff; +} +void gff_destroy(gff_t *gff) +{ + khint_t k; + if ( gff->init.gid2gene ) + { + for (k=0; kinit.gid2gene); k++) + { + if ( !kh_exist(gff->init.gid2gene, k) ) continue; + gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k); + free(gene->name); + free(gene); + } + kh_destroy(int2gene,gff->init.gid2gene); + } + + regidx_destroy(gff->idx_cds); + regidx_destroy(gff->idx_utr); + regidx_destroy(gff->idx_exon); + regidx_destroy(gff->idx_tscript); + + gff_id_destroy(&gff->tscript_ids); + free(gff); +} + diff --git a/bcftools/gff.h b/bcftools/gff.h new file mode 100644 index 0000000..ebb6463 --- /dev/null +++ b/bcftools/gff.h @@ -0,0 +1,332 @@ +/* The MIT License + + Copyright (c) 2023 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ +/* + GFF parsing code refactored from csq.c + + Things that would be nice to have + - dynamic N_REF_PAD + - for stop-lost events (also in frameshifts) report the number of truncated aa's + - memory could be greatly reduced by indexing gff (but it is quite compact already) + - deletions that go beyond transcript boundaries are not checked at sequence level + - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16 + - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882 + + Read about transcript types here + http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html + http://www.ensembl.org/info/genome/variation/predicted_data.html + https://www.gencodegenes.org/pages/biotypes.html + + List of supported biotypes + antisense + IG_C_gene + IG_D_gene + IG_J_gene + IG_LV_gene + IG_V_gene + lincRNA + lncRNA .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping + macro_lncRNA + miRNA + misc_RNA + Mt_rRNA + Mt_tRNA + polymorphic_pseudogene + processed_transcript + protein_coding, mRNA + ribozyme + rRNA + sRNA + scRNA + scaRNA + sense_intronic + sense_overlapping + snRNA + snoRNA + TR_C_gene + TR_D_gene + TR_J_gene + TR_V_gene + + The gff parsing logic + We collect features such by combining gff lines A,B,C as follows: + A .. gene line with a supported biotype + A.ID=~/^gene:/ + + B .. 
transcript line referencing A with supported biotype + B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/ + + C .. corresponding CDS, exon, and UTR lines: + C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ + + For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the + complete chain link C -> B -> A is required. For the rest, link B -> A suffices. + + + The supported consequence types, sorted by impact: + splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron) + splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron) + stop_gained .. DNA sequence variant resulting in a stop codon + frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame + stop_lost .. elongated transcript, stop codon changed + start_lost .. the first codon changed + inframe_altering .. combination of indels leading to unchanged reading frame and length + inframe_insertion .. inserted coding sequence, unchanged reading frame + inframe_deletion .. deleted coding sequence, unchanged reading frame + missense_variant .. amino acid (aa) change, unchanged length + splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron + synonymous_variant .. DNA sequence variant resulting in no amino acid change + stop_retained_variant .. different stop codon + start_retained_variant .. start codon retained by indel realignment + non_coding_variant .. variant in non-coding sequence, such as RNA gene + 5_prime_UTR_variant + 3_prime_UTR_variant + intron_variant .. reported only if none of the above + intergenic_variant .. reported only if none of the above + + + The annotation algorithm. + The algorithm checks if the variant falls in a region of a supported type. The + search is performed in the following order, until a match is found: + 1. 
idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences + 2. idx_utr(gf_utr_t) - check UTR hits + 3. idx_exon(gf_exon_t) - check for splice variants + 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc. + + These regidx indexes are created by parsing a gff3 file as follows: + 1. create the array "ftr" of all UTR, CDS, exons. This will be + processed later and pruned based on transcript types we want to keep. + In the same go, create the hash "id2tr" of transcripts to keep + (based on biotype) which maps from transcript_id to a transcript. At + the same time also build the hash "gid2gene" which maps from gene_id to + gf_gene_t pointer. + + 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes. + Use only features from "ftr" which are present in "id2tr". + + 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene. + + Data structures. + idx_cds, idx_utr, idx_exon, idx_tscript: + as described above, regidx structures for fast lookup of exons/transcripts + overlapping a region, the payload is a pointer to tscript.cds +*/ + +#ifndef GFF_H__ +#define GFF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "regidx.h" + +#ifndef __FUNCTION__ +# define __FUNCTION__ __func__ +#endif + +// Definition of splice_region, splice_acceptor and splice_donor +#define N_SPLICE_DONOR 2 +#define N_SPLICE_REGION_EXON 3 +#define N_SPLICE_REGION_INTRON 8 + +#define STRAND_REV 0 +#define STRAND_FWD 1 + +#define TRIM_NONE 0 +#define TRIM_5PRIME 1 +#define TRIM_3PRIME 2 + + +// GFF line types +#define GFF_UNKN_LINE 0 +#define GFF_TSCRIPT_LINE 1 +#define GFF_GENE_LINE 2 + + +/* + Genomic features, for fast lookup by position to overlapping features +*/ +#define GF_coding_bit 6 +#define GF_is_coding(x) ((x) & (1< +// @author Nicola Asuni +// @link https://github.com/tecnickcom/variantkey +// 
@license MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE) // @copyright 2017-2018 GENOMICS plc -// @license MIT (see LICENSE) -// @link https://github.com/genomicsplc/variantkey // // LICENSE // diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index 9b21b18..d42a6a3 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -101,6 +101,8 @@ typedef struct { int indels_v20; int argc; char **argv; + int write_index; + char *index_fn; } mplp_conf_t; typedef struct { @@ -489,37 +491,43 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) { // Left & right cigar op match. int lr = b->core.l_qseq > 500; - int lm = 0, rm = 0, k; + int lm = 0, rm = 0, k, nm = 0; for (k = 0; k < ncig; k++) { int cop = bam_cigar_op(cig[k]); if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) continue; if (cop == BAM_CMATCH || cop == BAM_CDIFF || - cop == BAM_CEQUAL) + cop == BAM_CEQUAL) { lm += bam_cigar_oplen(cig[k]); - else + nm++; + } else { break; + } } - for (k = ncig-1; k >= 0; k--) { - int cop = bam_cigar_op(cig[k]); - if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + // if everything is a match (or sequence (mis)match) then move on + // because we don't have an indel in the middle + if (nm != ncig) { + for (k = ncig-1; k >= 0; k--) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + rm += bam_cigar_oplen(cig[k]); + else + break; + } + + if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) continue; - if (cop == BAM_CMATCH || cop == BAM_CDIFF || - cop == BAM_CEQUAL) - rm += 
bam_cigar_oplen(cig[k]); - else - break; + if (lm >= REALN_DIST && rm >= REALN_DIST && + has_clip < (0.15+0.05*(nt>20))*nt) + continue; } - - if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) - continue; - - if (lm >= REALN_DIST && rm >= REALN_DIST && - has_clip < (0.15+0.05*(nt>20))*nt) - continue; } if (b->core.l_qseq > 500) { @@ -849,6 +857,7 @@ static int mpileup(mplp_conf_t *conf) for (i=0; ibcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); + if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname); conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, conf->delta_baseQ); @@ -958,6 +967,15 @@ static int mpileup(mplp_conf_t *conf) bcf_destroy1(conf->bcf_rec); if (conf->bcf_fp) { + if ( conf->write_index ) + { + if ( bcf_idx_save(conf->bcf_fp)<0 ) + { + if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); + error("Error: cannot write to index %s\n",conf->index_fn); + } + free(conf->index_fn); + } if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,conf->output_fname); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); @@ -1227,6 +1245,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" " --threads INT Use multithreading with INT worker threads [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" @@ -1375,6 +1394,7 @@ int main_mpileup(int argc, char *argv[]) {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, + {"write-index",no_argument,NULL,21}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1497,6 +1517,7 @@ int main_mpileup(int argc, char *argv[]) } break; case 20: mplp.indels_v20 = 1; break; + case 21: mplp.write_index = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 724a0ec..81c5849 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -2,7 +2,7 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -103,6 +103,8 @@ typedef struct { int indels_v20; int argc; char **argv; + int write_index; + char *index_fn; } mplp_conf_t; typedef struct { @@ -491,37 +493,43 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) { // Left & right cigar op match. 
int lr = b->core.l_qseq > 500; - int lm = 0, rm = 0, k; + int lm = 0, rm = 0, k, nm = 0; for (k = 0; k < ncig; k++) { int cop = bam_cigar_op(cig[k]); if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) continue; if (cop == BAM_CMATCH || cop == BAM_CDIFF || - cop == BAM_CEQUAL) + cop == BAM_CEQUAL) { lm += bam_cigar_oplen(cig[k]); - else + nm++; + } else { break; + } } - for (k = ncig-1; k >= 0; k--) { - int cop = bam_cigar_op(cig[k]); - if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + // if everything is a match (or sequence (mis)match) then move on + // because we don't have an indel in the middle + if (nm != ncig) { + for (k = ncig-1; k >= 0; k--) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + rm += bam_cigar_oplen(cig[k]); + else + break; + } + + if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) continue; - if (cop == BAM_CMATCH || cop == BAM_CDIFF || - cop == BAM_CEQUAL) - rm += bam_cigar_oplen(cig[k]); - else - break; + if (lm >= REALN_DIST && rm >= REALN_DIST && + has_clip < (0.15+0.05*(nt>20))*nt) + continue; } - - if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) - continue; - - if (lm >= REALN_DIST && rm >= REALN_DIST && - has_clip < (0.15+0.05*(nt>20))*nt) - continue; } if (b->core.l_qseq > 500) { @@ -851,6 +859,7 @@ static int mpileup(mplp_conf_t *conf) for (i=0; ibcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); + if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname); conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, conf->delta_baseQ); @@ -960,6 +969,15 @@ static int mpileup(mplp_conf_t *conf) bcf_destroy1(conf->bcf_rec); if 
(conf->bcf_fp) { + if ( conf->write_index ) + { + if ( bcf_idx_save(conf->bcf_fp)<0 ) + { + if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); + error("Error: cannot write to index %s\n",conf->index_fn); + } + free(conf->index_fn); + } if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); @@ -1229,6 +1247,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" " --threads INT Use multithreading with INT worker threads [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" @@ -1377,6 +1396,7 @@ int main_mpileup(int argc, char *argv[]) {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, + {"write-index",no_argument,NULL,21}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1499,6 +1519,7 @@ int main_mpileup(int argc, char *argv[]) } break; case 20: mplp.indels_v20 = 1; break; + case 21: mplp.write_index = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; diff --git a/bcftools/reheader.c b/bcftools/reheader.c index 4458f27..ed85217 100644 --- a/bcftools/reheader.c +++ b/bcftools/reheader.c @@ -68,7 +68,8 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= char *end = q; - int nopen = 1, chr_len = 0; + int nopen = 1; + hts_pos_t 
chr_len = 0; while ( *end && *end!='\n' ) end++; while ( *q && *q!='\n' && nopen>0 ) { @@ -118,7 +119,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see if ( !strcmp("ID",key.s) ) { if ( khash_str2int_has_key(chr_seen,val.s) ) continue; - chr_len = faidx_seq_len(fai, val.s); + chr_len = faidx_seq_len64(fai, val.s); if ( chr_len==-1 ) { free(val.s); free(key.s); free(tmp.s); @@ -136,7 +137,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see if ( quoted ) kputc('"',&tmp); } if ( !chr_name ) return end; - ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? tmp.s : ""); + ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? tmp.s : ""); free(key.s); free(val.s); free(tmp.s); return q; } @@ -211,7 +212,7 @@ static void update_from_fai(args_t *args) for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); + ksprintf(&hdr_txt_new,"##contig=\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i))); } kputs(tmp+1,&hdr_txt_new); @@ -699,7 +700,7 @@ int main_reheader(int argc, char *argv[]) int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - + static struct option loptions[] = { {"temp-prefix",1,0,'T'}, diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c index a069870..44dff8c 100644 --- a/bcftools/reheader.c.pysam.c +++ b/bcftools/reheader.c.pysam.c @@ -70,7 +70,8 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= char *end = q; - int nopen = 1, chr_len = 0; + int nopen = 1; + hts_pos_t chr_len = 0; while ( *end && *end!='\n' ) end++; while ( *q && *q!='\n' && nopen>0 ) { @@ -120,7 +121,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see if ( !strcmp("ID",key.s) ) { if ( khash_str2int_has_key(chr_seen,val.s) ) continue; - chr_len = faidx_seq_len(fai, 
val.s); + chr_len = faidx_seq_len64(fai, val.s); if ( chr_len==-1 ) { free(val.s); free(key.s); free(tmp.s); @@ -138,7 +139,7 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see if ( quoted ) kputc('"',&tmp); } if ( !chr_name ) return end; - ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? tmp.s : ""); + ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? tmp.s : ""); free(key.s); free(val.s); free(tmp.s); return q; } @@ -213,7 +214,7 @@ static void update_from_fai(args_t *args) for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); + ksprintf(&hdr_txt_new,"##contig=\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i))); } kputs(tmp+1,&hdr_txt_new); @@ -701,7 +702,7 @@ int main_reheader(int argc, char *argv[]) int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - + static struct option loptions[] = { {"temp-prefix",1,0,'T'}, diff --git a/bcftools/tsv2vcf.c b/bcftools/tsv2vcf.c index 596e75a..22dec30 100644 --- a/bcftools/tsv2vcf.c +++ b/bcftools/tsv2vcf.c @@ -10,10 +10,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c index 8c62157..83de6f3 100644 --- a/bcftools/tsv2vcf.c.pysam.c +++ b/bcftools/tsv2vcf.c.pysam.c @@ -12,10 +12,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/bcftools/variantkey.h b/bcftools/variantkey.h index ccd4d8d..a74935f 100644 --- a/bcftools/variantkey.h +++ b/bcftools/variantkey.h @@ -3,14 +3,15 @@ // variantkey.h // // @category Libraries -// @author Nicola Asuni -// @copyright 2017-2018 GENOMICS plc -// @license MIT (see LICENSE) -// @link https://github.com/genomicsplc/variantkey +// @author Nicola Asuni +// @link https://github.com/tecnickcom/variantkey +// @license MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE) +// @copyright 2017-2018 GENOMICS plc, 2018-2023 Nicola Asuni - Tecnick.com // // LICENSE // // Copyright (c) 2017-2018 GENOMICS plc +// Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -54,6 +55,7 @@ #define VKMASK_REFALT 0x000000007FFFFFFF //!< VariantKey binary mask for REF+ALT [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ] #define VKSHIFT_CHROM 59 //!< CHROM LSB position from the VariantKey LSB #define VKSHIFT_POS 31 //!< POS LSB position from the VariantKey LSB +#define MAXUINT32 0xFFFFFFFF //!< 
Maximum value for uint32_t /** * VariantKey struct. @@ -75,16 +77,54 @@ typedef struct vkrange_t uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding } vkrange_t; -/** @brief Returns chromosome numerical encoding. +/** @brief Returns the encoding for a numerical chromosome input. * * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. * @param size Length of the chrom string, excluding the terminating null byte. * * @return CHROM code */ +static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size) +{ + size_t i; + uint8_t v = (chrom[0] - '0'); + for (i = 1; i < size; i++) + { + if ((chrom[i] > '9') || (chrom[i] < '0')) + { + return 0; // NA: a character that is not a number was found. + } + v = ((v * 10) + (chrom[i] - '0')); + } + return v; +} + + +/** @brief Returns a true value (1) if the input chrom has 'chr' prefix (case insensitive). + * + * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. + * @param size Length of the chrom string, excluding the terminating null byte. + * + * @return True (1) if the chr prefix is present. + */ +static inline int has_chrom_chr_prefix(const char *chrom, size_t size) +{ + return ((size > 3) + && ((chrom[0] == 'c') || (chrom[0] == 'C')) + && ((chrom[1] == 'h') || (chrom[1] == 'H')) + && ((chrom[2] == 'r') || (chrom[2] == 'R'))); +} + +/** @brief Returns chromosome numerical encoding. + * + * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. + * @param size Length of the chrom string, excluding the terminating null byte. + * + * @return CHROM code or 0 in case of invalid input.
+ */ static inline uint8_t encode_chrom(const char *chrom, size_t size) { - // X > 23 ; Y > 24 ; M > 25 + // X = 23; Y = 24; M = 25; any other letter is mapped to 0: static const uint8_t onecharmap[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -98,12 +138,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; - // remove "chr" prefix - if ((size > 3) - && ((chrom[0] == 'c') || (chrom[0] == 'C')) - && ((chrom[1] == 'h') || (chrom[1] == 'H')) - && ((chrom[2] == 'r') || (chrom[2] == 'R'))) + if (has_chrom_chr_prefix(chrom, size)) { + // remove "chr" prefix chrom += 3; size -= 3; } @@ -111,19 +148,9 @@ static inline uint8_t encode_chrom(const char *chrom, size_t size) { return 0; } - if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number + if ((chrom[0] <= '9') && (chrom[0] >= '0')) { - size_t i; - uint8_t v = (chrom[0] - '0'); - for (i = 1; i < size; i++) - { - if ((chrom[i] > '9') || (chrom[i] < '0')) - { - return 0; // NA - } - v = ((v * 10) + (chrom[i] - '0')); - } - return v; + return encode_numeric_chrom(chrom, size); } if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't')))) { @@ -159,10 +186,10 @@ static inline uint32_t encode_base(const uint8_t c) { /* Encode base: - A > 0 - C > 1 - G > 2 - T > 3 + A = 0 + C = 1 + G = 2 + T = 3 */ static const uint32_t map[] = { @@ -205,7 +232,7 @@ static inline uint32_t encode_refalt_rev(const char *ref, size_t sizeref, const uint8_t bitpos = 23; if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) || (encode_allele(&h, &bitpos, alt, sizealt) < 0)) { - return 0; // error code + return MAXUINT32; // error code } return h; } @@ -318,7 +345,7 @@ static inline uint32_t encode_refalt(const char *ref, size_t sizeref, const char if ((sizeref + 
sizealt) <= 11) { uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt); - if (h != 0) + if (h != MAXUINT32) { return h; } @@ -486,7 +513,9 @@ static inline void decode_variantkey(uint64_t code, variantkey_t *vk) vk->refalt = extract_variantkey_refalt(code); } -/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. +/** + * Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. + * The variant should be already normalized (see normalize_variant or use normalized_variantkey). * * @param chrom Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted. * @param sizechrom Length of the chrom string, excluding the terminating null byte. diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index 495d2b5..b2e39ef 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -118,6 +118,8 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; bcf_sr_regions_t *tgts; + char *index_fn; + int write_index; regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns regitr_t *tgt_itr; @@ -2863,9 +2865,16 @@ static void init_data(args_t *args) if ( args->mark_sites ) { - if ( !args->targets_fname ) error("The -a option not given\n"); - bcf_hdr_printf(args->hdr_out,"##INFO=", - args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); + if ( !args->targets_fname ) + { + if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites); + fprintf(stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites); + bcf_hdr_printf(args->hdr_out,"##INFO=", + args->mark_sites,args->mark_sites); + } + else + bcf_hdr_printf(args->hdr_out,"##INFO=", + args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); @@ -2881,6 +2890,7 @@ static void init_data(args_t *args) if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -2943,7 +2953,19 @@ static void destroy_data(args_t *args) convert_destroy(args->set_ids); if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh) hts_close(args->out_fh); + if (args->out_fh) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + } free(args->sample_map); free(args->merge_method_str.s); } @@ -3072,6 +3094,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) args->cols[j].done = 0; if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { + hts_pos_t vcf_end = line->pos + line->rlen - 1; while ( regitr_overlap(args->tgt_itr) ) { annot_line_t *tmp = &args->alines[0]; @@ -3082,7 +3105,7 @@ static void annotate(args_t *args, bcf1_t *line) // Check min overlap int len_ann = tmp->end - tmp->start + 1; int len_vcf = line->rlen; - int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1; + int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? 
tmp->start : line->pos) + 1; assert( isec > 0 ); if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; @@ -3096,9 +3119,9 @@ static void annotate(args_t *args, bcf1_t *line) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); if ( ret==0 ) args->cols[j].done = 1; + has_overlap = 1; } } - has_overlap = 1; } for (j=0; jncols; j++) { @@ -3273,6 +3296,8 @@ static void annotate(args_t *args, bcf1_t *line) if ( args->mark_sites ) { + if ( !args->targets_fname ) has_overlap = 1; + // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 if ( args->mark_sites_logic==MARK_LISTED ) bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); @@ -3315,6 +3340,7 @@ static void usage(args_t *args) fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n"); @@ -3371,6 +3397,7 @@ int main_vcfannotate(int argc, char *argv[]) {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, + {"write-index",no_argument,NULL,13}, {NULL,0,NULL,0} }; char *tmp; @@ -3447,6 +3474,7 @@ int main_vcfannotate(int argc, char *argv[]) case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; case 12 : args->min_overlap_str = optarg; break; + case 13 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index 54f6a39..2234ddc 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2,7 +2,7 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -120,6 +120,8 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; bcf_sr_regions_t *tgts; + char *index_fn; + int write_index; regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns regitr_t *tgt_itr; @@ -2865,9 +2867,16 @@ static void init_data(args_t *args) if ( args->mark_sites ) { - if ( !args->targets_fname ) error("The -a option not given\n"); - bcf_hdr_printf(args->hdr_out,"##INFO=", - args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); + if ( !args->targets_fname ) + { + if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites); + fprintf(bcftools_stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites); + bcf_hdr_printf(args->hdr_out,"##INFO=", + args->mark_sites,args->mark_sites); + } + else + bcf_hdr_printf(args->hdr_out,"##INFO=", + args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites); } if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); @@ -2883,6 +2892,7 @@ static void init_data(args_t *args) if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -2945,7 +2955,19 @@ static void destroy_data(args_t *args) convert_destroy(args->set_ids); if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh) hts_close(args->out_fh); + if (args->out_fh) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed 
.. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + } free(args->sample_map); free(args->merge_method_str.s); } @@ -3074,6 +3096,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) args->cols[j].done = 0; if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { + hts_pos_t vcf_end = line->pos + line->rlen - 1; while ( regitr_overlap(args->tgt_itr) ) { annot_line_t *tmp = &args->alines[0]; @@ -3084,7 +3107,7 @@ static void annotate(args_t *args, bcf1_t *line) // Check min overlap int len_ann = tmp->end - tmp->start + 1; int len_vcf = line->rlen; - int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1; + int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? 
tmp->start : line->pos) + 1; assert( isec > 0 ); if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; @@ -3098,9 +3121,9 @@ static void annotate(args_t *args, bcf1_t *line) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); if ( ret==0 ) args->cols[j].done = 1; + has_overlap = 1; } } - has_overlap = 1; } for (j=0; jncols; j++) { @@ -3275,6 +3298,8 @@ static void annotate(args_t *args, bcf1_t *line) if ( args->mark_sites ) { + if ( !args->targets_fname ) has_overlap = 1; + // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 if ( args->mark_sites_logic==MARK_LISTED ) bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); @@ -3317,6 +3342,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); fprintf(bcftools_stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); fprintf(bcftools_stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n"); @@ -3373,6 +3399,7 @@ int main_vcfannotate(int argc, char *argv[]) {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, + {"write-index",no_argument,NULL,13}, {NULL,0,NULL,0} }; char *tmp; @@ -3449,6 +3476,7 @@ int main_vcfannotate(int argc, char *argv[]) case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; case 12 : args->min_overlap_str = optarg; break; + case 13 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index 1cd6f50..d2f6e2c 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -97,6 +97,8 @@ typedef struct int argc; char **argv; + char *index_fn; + int write_index; // int flag, prior_type, n1, n_sub, *sublist, n_perm; // uint32_t *trio_aux; @@ -715,6 +717,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); } @@ -753,6 +756,15 @@ static void destroy_data(args_t *args) free(args->str.s); if ( args->gvcf ) gvcf_destroy(args->gvcf); bcf_hdr_destroy(args->aux.hdr); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); bcf_sr_destroy(args->aux.srs); } @@ -908,6 +920,7 @@ static void usage(args_t *args) fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); fprintf(stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); @@ -990,6 +1003,7 @@ int main_vcfcall(int argc, char *argv[]) {"chromosome-X",no_argument,NULL,'X'}, {"chromosome-Y",no_argument,NULL,'Y'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; @@ -1076,6 +1090,7 @@ int main_vcfcall(int argc, char *argv[]) args.regions_overlap = parse_overlap_option(optarg); if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; + case 10: args.write_index = 1; break; default: usage(&args); } } diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 975247c..a955342 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -2,7 +2,7 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -99,6 +99,8 @@ typedef struct int argc; char **argv; + char *index_fn; + int write_index; // int flag, prior_type, n1, n_sub, *sublist, n_perm; // uint32_t *trio_aux; @@ -717,6 +719,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); } @@ -755,6 +758,15 @@ static void destroy_data(args_t *args) free(args->str.s); if ( args->gvcf ) gvcf_destroy(args->gvcf); bcf_hdr_destroy(args->aux.hdr); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); bcf_sr_destroy(args->aux.srs); } @@ -910,6 +922,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Consensus/variant calling options:\n"); fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); @@ -992,6 +1005,7 @@ int main_vcfcall(int argc, char *argv[]) {"chromosome-X",no_argument,NULL,'X'}, {"chromosome-Y",no_argument,NULL,'Y'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; @@ -1078,6 +1092,7 @@ int main_vcfcall(int argc, char *argv[]) args.regions_overlap = parse_overlap_option(optarg); if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; + case 10: args.write_index = 1; break; default: usage(&args); } } diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index 74fd036..8e25cc5 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -46,6 +46,8 @@ typedef struct _args_t int output_type, n_threads, record_cmd_line, clevel; bcf_hdr_t *out_hdr; int *seen_seq; + char *index_fn; + int write_index; // phasing int *start_pos, start_tid, ifname; @@ -59,10 +61,21 @@ typedef struct _args_t int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap; int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; int verbose, explicit_output_type, ligate_force, ligate_warn; + int sites_only; htsThreadPool *tpool; } args_t; +static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr) +{ + if ( !args->sites_only ) return hdr; + bcf_hdr_t *rmme = hdr; + hdr = bcf_hdr_subset(rmme, 0, 0, 0); + bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); + bcf_hdr_destroy(rmme); + return hdr; +} + static void init_data(args_t *args) { bcf1_t *line = NULL; @@ -83,6 +96,8 @@ static void init_data(args_t *args) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); + hdr = drop_hdr_genotypes(args, hdr); + args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr); if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) ) error("Different number of samples in %s. 
Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); @@ -142,6 +157,7 @@ static void init_data(args_t *args) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); } if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->allow_overlaps ) { @@ -203,7 +219,16 @@ static void destroy_data(args_t *args) int i; if ( args->out_fh ) { - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n",args->output_fname?args->output_fname:"stdout"); } if ( args->tpool && !args->files ) { @@ -264,7 +289,7 @@ static void phased_flush(args_t *args) bcf1_t *brec = args->buf[i+1]; int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa); - if ( nGTs < 0 ) + if ( nGTs < 0 ) { if ( !gt_absent_warned ) { @@ -359,7 +384,7 @@ static void phased_flush(args_t *args) bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; jphase_qual[j] < args->min_PQ ) + if ( args->phase_qual[j] < args->min_PQ ) { args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; @@ -582,13 +607,14 @@ static void concat(args_t *args) { bcf1_t *line = bcf_sr_get_line(args->files,i); if ( !line ) continue; + if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0); bcf_translate(args->out_hdr, args->files->readers[i].header, line); if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( args->remove_dups ) break; } } } - else // concatenating + else // concatenate as is { struct timeval t0, t1; kstring_t tmp = {0,0,0}; @@ -604,6 +630,13 @@ static void concat(args_t *args) htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); + if ( args->sites_only ) + { + bcf_hdr_t *hdr_ori = hdr; + hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0); + bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); + bcf_hdr_destroy(hdr_ori); + } if ( !fp->is_bin && args->output_type&FT_VCF ) { line->max_unpack = BCF_UN_STR; @@ -611,6 +644,22 @@ static void concat(args_t *args) while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) { char *str = fp->line.s; + + // remove genotypes + if ( args->sites_only ) + { + int ntab = 0; + while ( *str ) + { + if ( *str == '\t' && ++ntab==8 ) + { + 
*str = 0; + break; + } + str++; + } + str = fp->line.s; + } while ( *str && *str!='\t' ) str++; tmp.l = 0; kputsn(fp->line.s,str-fp->line.s,&tmp); @@ -639,6 +688,7 @@ static void concat(args_t *args) line->max_unpack = 0; while ( bcf_read(fp, hdr, line)==0 ) { + if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0); bcf_translate(args->out_hdr, hdr, line); if ( prev_chr_id!=line->rid ) @@ -917,6 +967,7 @@ static void usage(args_t *args) fprintf(stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: \n"); fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n"); fprintf(stderr, " -f, --file-list FILE Read the list of files from a file.\n"); + fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information.\n"); fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); fprintf(stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n"); fprintf(stderr, " --ligate-warn Drop sites in imperfect overlaps\n"); @@ -931,6 +982,7 @@ static void usage(args_t *args) fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -969,10 +1021,12 @@ int main_vcfconcat(int argc, char *argv[]) {"file-list",required_argument,NULL,'f'}, {"min-PQ",required_argument,NULL,'q'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,13}, + {"drop-genotypes",no_argument,NULL,'G'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0) { switch (c) { case 'c': args->compact_PS = 1; 
break; @@ -980,7 +1034,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; case 'D': args->remove_dups = "exact"; break; - case 'q': + case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; @@ -988,6 +1042,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'a': args->allow_overlaps = 1; break; case 'l': args->phased_concat = 1; break; case 'f': args->file_list = optarg; break; + case 'G': args->sites_only = 1; break; case 'o': args->output_fname = optarg; break; case 'O': args->explicit_output_type = 1; @@ -1021,6 +1076,7 @@ int main_vcfconcat(int argc, char *argv[]) args->verbose = strtol(optarg, &tmp, 0); if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); break; + case 13 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -1035,6 +1091,7 @@ int main_vcfconcat(int argc, char *argv[]) } if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n"); if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. 
Please run with -l only.\n"); + if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n"); if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); if ( args->file_list ) { @@ -1049,6 +1106,7 @@ int main_vcfconcat(int argc, char *argv[]) { if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n"); if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n"); + if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n"); naive_concat(args); destroy_data(args); free(args); diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index e1baeef..0d3b394 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -48,6 +48,8 @@ typedef struct _args_t int output_type, n_threads, record_cmd_line, clevel; bcf_hdr_t *out_hdr; int *seen_seq; + char *index_fn; + int write_index; // phasing int *start_pos, start_tid, ifname; @@ -61,10 +63,21 @@ typedef struct _args_t int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap; int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; int verbose, explicit_output_type, ligate_force, ligate_warn; + int sites_only; htsThreadPool *tpool; } args_t; +static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr) +{ + if ( !args->sites_only ) return hdr; + bcf_hdr_t *rmme = hdr; + hdr = bcf_hdr_subset(rmme, 0, 0, 0); + bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); + bcf_hdr_destroy(rmme); + return hdr; +} + static void init_data(args_t *args) { bcf1_t *line = NULL; @@ -85,6 +98,8 @@ static void init_data(args_t *args) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); + hdr = drop_hdr_genotypes(args, hdr); + args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr); if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) ) error("Different number of samples in %s. 
Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); @@ -144,6 +159,7 @@ static void init_data(args_t *args) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); } if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->allow_overlaps ) { @@ -205,7 +221,16 @@ static void destroy_data(args_t *args) int i; if ( args->out_fh ) { - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. 
%s\n",args->output_fname?args->output_fname:"bcftools_stdout"); } if ( args->tpool && !args->files ) { @@ -266,7 +291,7 @@ static void phased_flush(args_t *args) bcf1_t *brec = args->buf[i+1]; int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa); - if ( nGTs < 0 ) + if ( nGTs < 0 ) { if ( !gt_absent_warned ) { @@ -361,7 +386,7 @@ static void phased_flush(args_t *args) bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; jphase_qual[j] < args->min_PQ ) + if ( args->phase_qual[j] < args->min_PQ ) { args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; @@ -584,13 +609,14 @@ static void concat(args_t *args) { bcf1_t *line = bcf_sr_get_line(args->files,i); if ( !line ) continue; + if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0); bcf_translate(args->out_hdr, args->files->readers[i].header, line); if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( args->remove_dups ) break; } } } - else // concatenating + else // concatenate as is { struct timeval t0, t1; kstring_t tmp = {0,0,0}; @@ -606,6 +632,13 @@ static void concat(args_t *args) htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); + if ( args->sites_only ) + { + bcf_hdr_t *hdr_ori = hdr; + hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0); + bcf_hdr_remove(hdr, BCF_HL_FMT, NULL); + bcf_hdr_destroy(hdr_ori); + } if ( !fp->is_bin && args->output_type&FT_VCF ) { line->max_unpack = BCF_UN_STR; @@ -613,6 +646,22 @@ static void concat(args_t *args) while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) { char *str = fp->line.s; + + // remove genotypes + if ( args->sites_only ) + { + int ntab = 0; + while ( *str ) + { + if ( *str == '\t' && 
++ntab==8 ) + { + *str = 0; + break; + } + str++; + } + str = fp->line.s; + } while ( *str && *str!='\t' ) str++; tmp.l = 0; kputsn(fp->line.s,str-fp->line.s,&tmp); @@ -641,6 +690,7 @@ static void concat(args_t *args) line->max_unpack = 0; while ( bcf_read(fp, hdr, line)==0 ) { + if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0); bcf_translate(args->out_hdr, hdr, line); if ( prev_chr_id!=line->rid ) @@ -919,6 +969,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: \n"); fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d exact\n"); fprintf(bcftools_stderr, " -f, --file-list FILE Read the list of files from a file.\n"); + fprintf(bcftools_stderr, " -G, --drop-genotypes Drop individual genotype information.\n"); fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); fprintf(bcftools_stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n"); fprintf(bcftools_stderr, " --ligate-warn Drop sites in imperfect overlaps\n"); @@ -933,6 +984,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -971,10 +1023,12 @@ int main_vcfconcat(int argc, char *argv[]) {"file-list",required_argument,NULL,'f'}, {"min-PQ",required_argument,NULL,'q'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,13}, + {"drop-genotypes",no_argument,NULL,'G'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) 
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0) { switch (c) { case 'c': args->compact_PS = 1; break; @@ -982,7 +1036,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; case 'D': args->remove_dups = "exact"; break; - case 'q': + case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; @@ -990,6 +1044,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'a': args->allow_overlaps = 1; break; case 'l': args->phased_concat = 1; break; case 'f': args->file_list = optarg; break; + case 'G': args->sites_only = 1; break; case 'o': args->output_fname = optarg; break; case 'O': args->explicit_output_type = 1; @@ -1023,6 +1078,7 @@ int main_vcfconcat(int argc, char *argv[]) args->verbose = strtol(optarg, &tmp, 0); if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); break; + case 13 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -1037,6 +1093,7 @@ int main_vcfconcat(int argc, char *argv[]) } if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n"); if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. 
Please run with -l only.\n"); + if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n"); if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); if ( args->file_list ) { @@ -1051,6 +1108,7 @@ int main_vcfconcat(int argc, char *argv[]) { if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n"); if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n"); + if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n"); naive_concat(args); destroy_data(args); free(args); diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index ce5ed99..76c4a32 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -59,7 +59,7 @@ struct _args_t bcf_hdr_t *header; void (*convert_func)(struct _args_t *); struct { - int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; + int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written; } n; kstring_t str; int32_t *gts; @@ -70,6 +70,11 @@ struct _args_t char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; int argc, n_threads, record_cmd_line, keep_duplicates, clevel; + char *index_fn; + int write_index; + struct { + kstring_t ref,alt,refalt; + } tsv; }; static void destroy_data(args_t *args) @@ -139,6 +144,36 @@ static void open_vcf(args_t *args, const char *format_str) free(samples); } +static int _set_ref_alt(args_t *args, bcf1_t *rec) +{ + args->tsv.refalt.l = 0; + kputs(args->tsv.ref.s, &args->tsv.refalt); + if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) ) + { + kputc(',', &args->tsv.refalt); + kputs(args->tsv.alt.s, &args->tsv.refalt); + } + bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s); + args->tsv.ref.l = 0; + args->tsv.alt.l = 0; + args->tsv.refalt.l = 0; + return 0; +} +static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*) usr; + kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref); + if ( args->tsv.alt.l ) return _set_ref_alt(args,rec); + return 0; +} +static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*) usr; + kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt); + if ( args->tsv.ref.l ) return _set_ref_alt(args,rec); + return 0; +} + // Try to set CHROM:POS_REF_ALT[_END]. 
Return 0 on success, -1 on error static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) { @@ -160,7 +195,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) // REF,ALT args->str.l = 0; se = ++ss; - while ( se < tsv->se && *se!='_' ) se++; + while ( se < tsv->se && *se!='_' ) se++; if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; @@ -269,12 +304,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) if ( aa >= ab ) { if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0); - else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); + else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } - else if ( ab >= bb ) + else if ( ab >= bb ) { args->gts[2*i+0] = bcf_gt_unphased(0); - args->gts[2*i+1] = bcf_gt_unphased(1); + args->gts[2*i+1] = bcf_gt_unphased(1); } else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } @@ -293,7 +328,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); } // up is short for "unphased" - int nup = 0; + int nup = 0; for (i=0; iss + 4*i + nup; @@ -324,11 +359,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) break; default : fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); - return -1; + return -1; } if( ss[all*2+up+1]=='*' ) up = up + 1; } - + if(up && up != 2) { fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); @@ -356,13 +391,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) static void gensample_to_vcf(args_t *args) { /* - * Inpute: IMPUTE2 output (indentation changed here for clarity): + * Inpute: IMPUTE2 output (indentation changed here for clarity): * * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ... * --- 20:62116698_C_A 62116698 C A 1 0 0 ... * * Second column is expected in the form of CHROM:POS_REF_ALT. 
We use second - * column because the first can be empty ("--") when filling sites from reference + * column because the first can be empty ("--") when filling sites from reference * panel. When the option --vcf-ids is given, the first column is used to set the * VCF ID. * @@ -455,6 +490,7 @@ static void gensample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -474,6 +510,15 @@ static void gensample_to_vcf(args_t *args) } while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname); bcf_hdr_destroy(args->header); @@ -589,6 +634,7 @@ static void haplegendsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); @@ -616,6 +662,15 @@ static void 
haplegendsample_to_vcf(args_t *args) } } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname); @@ -731,6 +786,7 @@ static void hapsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -749,6 +805,15 @@ static void hapsample_to_vcf(args_t *args) } while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); bcf_hdr_destroy(args->header); @@ -784,7 +849,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) } for (i=0; isex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); @@ -915,7 +980,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", nok, 
no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -976,7 +1041,7 @@ static void vcf_to_haplegendsample(args_t *args) { char *sample2sex = NULL; if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); - + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -1078,7 +1143,7 @@ static void vcf_to_hapsample(args_t *args) kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); - + if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); else @@ -1213,7 +1278,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] { if ( se - ss > 2 ) return -1; // currently only SNPs - if ( ss[0]=='-' ) + if ( ss[0]=='-' || ss[0]=='.' ) { // missing GT gts[0] = bcf_gt_missing; @@ -1229,7 +1294,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] if ( alleles[a0]<0 ) alleles[a0] = (*nals)++; if ( alleles[a1]<0 ) alleles[a1] = (*nals)++; - gts[0] = bcf_gt_unphased(alleles[a0]); + gts[0] = bcf_gt_unphased(alleles[a0]); gts[1] = ss[1] ? 
bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end; if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR @@ -1265,7 +1330,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) } ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) + if ( ret==-2 ) { // something else than a SNP free(ref); @@ -1275,7 +1340,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) args->str.l = 0; kputc(ref[0], &args->str); - for (i=0; i<5; i++) + for (i=0; i<5; i++) { if ( alleles[i]>0 ) { @@ -1293,7 +1358,6 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) static void tsv_to_vcf(args_t *args) { if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n"); - if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n"); args->ref = fai_load(args->ref_fname); if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname); @@ -1303,17 +1367,21 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_append(args->header, "##FORMAT="); if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); - int i, n; - char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); - if ( !smpls ) error("Could not parse %s\n", args->sample_list); - for (i=0; isample_list ) { - bcf_hdr_add_sample(args->header, smpls[i]); - free(smpls[i]); + smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); + if ( !smpl ) error("Could not parse %s\n", args->sample_list); + for (i=0; iheader, smpl[i]); + free(smpl[i]); + } + free(smpl); + bcf_hdr_add_sample(args->header, NULL); + args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2); } - free(smpls); - bcf_hdr_add_sample(args->header, NULL); - args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); char wmode[8]; 
set_wmode(wmode,args->output_type,args->outfname,args->clevel); @@ -1321,12 +1389,18 @@ static void tsv_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n"); if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n"); - if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n"); + if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) + { + if ( args->sample_list ) error("Expected AA column with -s/-S\n"); + if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 ) + error("Expected REF and ALT columns when AA was not given\n"); + } bcf1_t *rec = bcf_init(); bcf_float_set_missing(rec->qual); @@ -1343,6 +1417,7 @@ static void tsv_to_vcf(args_t *args) if ( !tsv_parse(tsv, rec, line.s) ) { if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + args->n.written++; } else args->n.skipped++; @@ -1350,20 +1425,36 @@ static void tsv_to_vcf(args_t *args) if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname); free(line.s); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", 
args->index_fn); + } + free(args->index_fn); + } bcf_hdr_destroy(args->header); if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); tsv_destroy(tsv); bcf_destroy(rec); free(args->str.s); free(args->gts); + free(args->tsv.ref.s); + free(args->tsv.alt.s); + free(args->tsv.refalt.s); fprintf(stderr,"Rows total: \t%d\n", args->n.total); fprintf(stderr,"Rows skipped: \t%d\n", args->n.skipped); - fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing); - fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr); - fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra); - fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa); - fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa); + fprintf(stderr,"Sites written: \t%d\n", args->n.written); + if ( args->sample_list ) + { + fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing); + fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr); + fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra); + fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa); + fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa); + } } static void vcf_to_vcf(args_t *args) @@ -1377,6 +1468,7 @@ static void vcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); while ( bcf_sr_next_line(args->files) ) { @@ -1389,6 +1481,15 @@ static void vcf_to_vcf(args_t *args) } if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); } @@ -1409,6 +1510,7 @@ static void gvcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); int32_t *itmp = NULL, nitmp = 0; @@ -1419,7 +1521,7 @@ static void gvcf_to_vcf(args_t *args) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); continue; @@ -1469,6 +1571,15 @@ static void gvcf_to_vcf(args_t *args) } } free(itmp); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); } @@ -1497,6 +1608,7 @@ static void usage(void) fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(stderr, " -G, --gensample2vcf ... 
|,\n"); @@ -1528,7 +1640,7 @@ static void usage(void) fprintf(stderr, "\n"); fprintf(stderr, "TSV conversion:\n"); fprintf(stderr, " --tsv2vcf FILE\n"); - fprintf(stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(stderr, " -c, --columns STRING Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n"); fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); fprintf(stderr, " -s, --samples LIST List of sample names\n"); fprintf(stderr, " -S, --samples-file FILE File of sample names\n"); @@ -1590,6 +1702,7 @@ int main_vcfconvert(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, {"keep-duplicates",no_argument,NULL,12}, + {"write-index",no_argument,NULL,16}, {NULL,0,NULL,0} }; char *tmp; @@ -1618,6 +1731,7 @@ int main_vcfconvert(int argc, char *argv[]) case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break; case 15 : args->gen_3N6 = 1; break; + case 16 : args->write_index = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1667,7 +1781,7 @@ int main_vcfconvert(int argc, char *argv[]) else args->infname = argv[optind]; } if ( !args->infname ) usage(); - + if ( args->convert_func ) args->convert_func(args); else vcf_to_vcf(args); diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index f340171..16bb3be 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -61,7 +61,7 @@ struct _args_t bcf_hdr_t *header; void (*convert_func)(struct _args_t *); struct { - int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; + int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written; } n; kstring_t str; int32_t *gts; @@ -72,6 +72,11 @@ struct _args_t char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; int argc, n_threads, record_cmd_line, keep_duplicates, clevel; + char *index_fn; + int write_index; + struct { + kstring_t ref,alt,refalt; + } tsv; }; static void destroy_data(args_t *args) @@ -141,6 +146,36 @@ static void open_vcf(args_t *args, const char *format_str) free(samples); } +static int _set_ref_alt(args_t *args, bcf1_t *rec) +{ + args->tsv.refalt.l = 0; + kputs(args->tsv.ref.s, &args->tsv.refalt); + if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) ) + { + kputc(',', &args->tsv.refalt); + kputs(args->tsv.alt.s, &args->tsv.refalt); + } + bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s); + args->tsv.ref.l = 0; + args->tsv.alt.l = 0; + args->tsv.refalt.l = 0; + return 0; +} +static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*) usr; + kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref); + if ( args->tsv.alt.l ) return _set_ref_alt(args,rec); + return 0; +} +static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*) usr; + kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt); + if ( args->tsv.ref.l ) return _set_ref_alt(args,rec); + return 0; +} + // Try to set CHROM:POS_REF_ALT[_END]. 
Return 0 on success, -1 on error static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) { @@ -162,7 +197,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) // REF,ALT args->str.l = 0; se = ++ss; - while ( se < tsv->se && *se!='_' ) se++; + while ( se < tsv->se && *se!='_' ) se++; if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; @@ -271,12 +306,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) if ( aa >= ab ) { if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0); - else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); + else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } - else if ( ab >= bb ) + else if ( ab >= bb ) { args->gts[2*i+0] = bcf_gt_unphased(0); - args->gts[2*i+1] = bcf_gt_unphased(1); + args->gts[2*i+1] = bcf_gt_unphased(1); } else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } @@ -295,7 +330,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); } // up is short for "unphased" - int nup = 0; + int nup = 0; for (i=0; iss + 4*i + nup; @@ -326,11 +361,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) break; default : fprintf(bcftools_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); - return -1; + return -1; } if( ss[all*2+up+1]=='*' ) up = up + 1; } - + if(up && up != 2) { fprintf(bcftools_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); @@ -358,13 +393,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) static void gensample_to_vcf(args_t *args) { /* - * Inpute: IMPUTE2 output (indentation changed here for clarity): + * Inpute: IMPUTE2 output (indentation changed here for clarity): * * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ... * --- 20:62116698_C_A 62116698 C A 1 0 0 ... * * Second column is expected in the form of CHROM:POS_REF_ALT. 
We use second - * column because the first can be empty ("--") when filling sites from reference + * column because the first can be empty ("--") when filling sites from reference * panel. When the option --vcf-ids is given, the first column is used to set the * VCF ID. * @@ -457,6 +492,7 @@ static void gensample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -476,6 +512,15 @@ static void gensample_to_vcf(args_t *args) } while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname); bcf_hdr_destroy(args->header); @@ -591,6 +636,7 @@ static void haplegendsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); @@ -618,6 +664,15 @@ static void 
haplegendsample_to_vcf(args_t *args) } } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname); @@ -733,6 +788,7 @@ static void hapsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -751,6 +807,15 @@ static void hapsample_to_vcf(args_t *args) } while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); bcf_hdr_destroy(args->header); @@ -786,7 +851,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) } for (i=0; isex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); @@ -917,7 +982,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", 
nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -978,7 +1043,7 @@ static void vcf_to_haplegendsample(args_t *args) { char *sample2sex = NULL; if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); - + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -1080,7 +1145,7 @@ static void vcf_to_hapsample(args_t *args) kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); - + if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); else @@ -1215,7 +1280,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] { if ( se - ss > 2 ) return -1; // currently only SNPs - if ( ss[0]=='-' ) + if ( ss[0]=='-' || ss[0]=='.' ) { // missing GT gts[0] = bcf_gt_missing; @@ -1231,7 +1296,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] if ( alleles[a0]<0 ) alleles[a0] = (*nals)++; if ( alleles[a1]<0 ) alleles[a1] = (*nals)++; - gts[0] = bcf_gt_unphased(alleles[a0]); + gts[0] = bcf_gt_unphased(alleles[a0]); gts[1] = ss[1] ? 
bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end; if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR @@ -1267,7 +1332,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) } ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) + if ( ret==-2 ) { // something else than a SNP free(ref); @@ -1277,7 +1342,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) args->str.l = 0; kputc(ref[0], &args->str); - for (i=0; i<5; i++) + for (i=0; i<5; i++) { if ( alleles[i]>0 ) { @@ -1295,7 +1360,6 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) static void tsv_to_vcf(args_t *args) { if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n"); - if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n"); args->ref = fai_load(args->ref_fname); if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname); @@ -1305,17 +1369,21 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_append(args->header, "##FORMAT="); if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); - int i, n; - char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); - if ( !smpls ) error("Could not parse %s\n", args->sample_list); - for (i=0; isample_list ) { - bcf_hdr_add_sample(args->header, smpls[i]); - free(smpls[i]); + smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); + if ( !smpl ) error("Could not parse %s\n", args->sample_list); + for (i=0; iheader, smpl[i]); + free(smpl[i]); + } + free(smpl); + bcf_hdr_add_sample(args->header, NULL); + args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2); } - free(smpls); - bcf_hdr_add_sample(args->header, NULL); - args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); char wmode[8]; 
set_wmode(wmode,args->output_type,args->outfname,args->clevel); @@ -1323,12 +1391,18 @@ static void tsv_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n"); if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n"); - if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n"); + if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) + { + if ( args->sample_list ) error("Expected AA column with -s/-S\n"); + if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 ) + error("Expected REF and ALT columns when AA was not given\n"); + } bcf1_t *rec = bcf_init(); bcf_float_set_missing(rec->qual); @@ -1345,6 +1419,7 @@ static void tsv_to_vcf(args_t *args) if ( !tsv_parse(tsv, rec, line.s) ) { if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + args->n.written++; } else args->n.skipped++; @@ -1352,20 +1427,36 @@ static void tsv_to_vcf(args_t *args) if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname); free(line.s); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", 
args->index_fn); + } + free(args->index_fn); + } bcf_hdr_destroy(args->header); if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); tsv_destroy(tsv); bcf_destroy(rec); free(args->str.s); free(args->gts); + free(args->tsv.ref.s); + free(args->tsv.alt.s); + free(args->tsv.refalt.s); fprintf(bcftools_stderr,"Rows total: \t%d\n", args->n.total); fprintf(bcftools_stderr,"Rows skipped: \t%d\n", args->n.skipped); - fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing); - fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr); - fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra); - fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa); - fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa); + fprintf(bcftools_stderr,"Sites written: \t%d\n", args->n.written); + if ( args->sample_list ) + { + fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing); + fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr); + fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra); + fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa); + fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa); + } } static void vcf_to_vcf(args_t *args) @@ -1379,6 +1470,7 @@ static void vcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); while ( bcf_sr_next_line(args->files) ) { @@ -1391,6 +1483,15 @@ static void vcf_to_vcf(args_t *args) } if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", 
args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); } @@ -1411,6 +1512,7 @@ static void gvcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); int32_t *itmp = NULL, nitmp = 0; @@ -1421,7 +1523,7 @@ static void gvcf_to_vcf(args_t *args) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); continue; @@ -1471,6 +1573,15 @@ static void gvcf_to_vcf(args_t *args) } } free(itmp); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); } @@ -1499,6 +1610,7 @@ static void usage(void) fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n"); fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(bcftools_stderr, " -G, --gensample2vcf ... 
|,\n"); @@ -1530,7 +1642,7 @@ static void usage(void) fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "TSV conversion:\n"); fprintf(bcftools_stderr, " --tsv2vcf FILE\n"); - fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n"); fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); fprintf(bcftools_stderr, " -s, --samples LIST List of sample names\n"); fprintf(bcftools_stderr, " -S, --samples-file FILE File of sample names\n"); @@ -1592,6 +1704,7 @@ int main_vcfconvert(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, {"keep-duplicates",no_argument,NULL,12}, + {"write-index",no_argument,NULL,16}, {NULL,0,NULL,0} }; char *tmp; @@ -1620,6 +1733,7 @@ int main_vcfconvert(int argc, char *argv[]) case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break; case 15 : args->gen_3N6 = 1; break; + case 16 : args->write_index = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1669,7 +1783,7 @@ int main_vcfconvert(int argc, char *argv[]) else args->infname = argv[optind]; } if ( !args->infname ) usage(); - + if ( args->convert_func ) args->convert_func(args); else vcf_to_vcf(args); diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index 68d8672..8665409 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -77,6 +77,8 @@ typedef struct _args_t char **argv, *output_fname, *targets_list, *regions_list, *mask_list; int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; regidx_t *mask; + char *index_fn; + int write_index; } args_t; @@ -491,6 +493,7 @@ static void usage(args_t *args) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -533,13 +536,14 @@ int main_vcffilter(int argc, char *argv[]) {"SnpGap",required_argument,NULL,'g'}, {"IndelGap",required_argument,NULL,'G'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,12}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) { switch (c) { case 'g': - args->snp_gap = strtol(optarg,&tmp,10); + args->snp_gap = strtol(optarg,&tmp,10); if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); if ( *tmp==':' ) { @@ -625,6 +629,7 @@ int main_vcffilter(int argc, char *argv[]) else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; else error("Could not parse: --mask-overlap %s\n",optarg); break; + case 12 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -672,6 +677,7 @@ int main_vcffilter(int argc, char *argv[]) init_data(args); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && 
init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files, 0); @@ -713,7 +719,15 @@ int main_vcffilter(int argc, char *argv[]) } } buffered_filters(args, NULL); - + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); destroy_data(args); bcf_sr_destroy(args->files); diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index f998083..6d17151 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -2,7 +2,7 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -79,6 +79,8 @@ typedef struct _args_t char **argv, *output_fname, *targets_list, *regions_list, *mask_list; int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; regidx_t *mask; + char *index_fn; + int write_index; } args_t; @@ -493,6 +495,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -535,13 +538,14 @@ int main_vcffilter(int argc, char *argv[]) {"SnpGap",required_argument,NULL,'g'}, {"IndelGap",required_argument,NULL,'G'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,12}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) { switch (c) { case 'g': - args->snp_gap = strtol(optarg,&tmp,10); + args->snp_gap = strtol(optarg,&tmp,10); if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); if ( *tmp==':' ) { @@ -627,6 +631,7 @@ int main_vcffilter(int argc, char *argv[]) else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; else error("Could not parse: --mask-overlap %s\n",optarg); break; + case 12 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -674,6 +679,7 @@ int main_vcffilter(int argc, char *argv[]) init_data(args); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && 
init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files, 0); @@ -715,7 +721,15 @@ int main_vcffilter(int argc, char *argv[]) } } buffered_filters(args, NULL); - + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); destroy_data(args); bcf_sr_destroy(args->files); diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index f646e1f..561be62 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -1,6 +1,6 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -59,6 +59,7 @@ typedef struct int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; int regions_overlap, targets_overlap; int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; + int nused[2][2]; double *pdiff, *qry_prob, *gt_prob; uint32_t *ndiff,*ncnt,ncmp, npairs; int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr; @@ -309,7 +310,7 @@ static void init_data(args_t *args) init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); } if ( args->gt_samples ) - { + { init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, args->gt_hdr ? args->gt_hdr : args->qry_hdr, args->gt_fname ? args->gt_fname : args->qry_fname); @@ -377,7 +378,7 @@ static void init_data(args_t *args) args->gt_prob = args->cross_check ? 
args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob)); // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing - // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding + // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding // probabilities of 0/0, 0/1, and 1/1 genotypes for (i=0; i<8; i++) for (j=0; j<3; j++) @@ -555,7 +556,9 @@ static void process_line(args_t *args) args->gt_arr = args->qry_arr; } + // stats: number of compared sites, and used tags args->ncmp++; + args->nused[qry_use_GT][gt_use_GT]++; double af,hwe_dsg[8]; if ( args->calc_hwe_prob ) @@ -636,7 +639,7 @@ static void process_line(args_t *args) gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob); if ( !gt_dsg ) continue; // missing value if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom - + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; qry_dsg = qry_use_GT ? 
gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob); if ( !qry_dsg ) continue; // missing value @@ -797,11 +800,15 @@ static void report(args_t *args) fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data); fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT); fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL); + fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]); + fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]); + fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]); + fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]); fprintf(args->fp,"# DC, discordance:\n"); fprintf(args->fp,"# - query sample\n"); fprintf(args->fp,"# - genotyped sample\n"); - fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n"); - fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n"); + fprintf(args->fp,"# - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n"); + fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n"); fprintf(args->fp,"# - number of sites compared (bigger is better)\n"); fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n"); @@ -1023,7 +1030,7 @@ static int is_input_okay(args_t *args, int nmatch) return 1; not_okay: - fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", + fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. 
(This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1,msg); return 0; } @@ -1097,7 +1104,7 @@ int main_vcfgtcheck(int argc, char *argv[]) args->es_max_mem = strdup("500M"); // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 - // - min_inter: pairs with smaller err value will be considered identical + // - min_inter: pairs with smaller err value will be considered identical // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered // different. If negative, the cutoff may be heuristically lowered args->min_inter_err = 0.23; @@ -1169,7 +1176,7 @@ int main_vcfgtcheck(int argc, char *argv[]) case 3 : args->calc_hwe_prob = 0; break; case 4 : error("The option -S, --target-sample has been deprecated\n"); break; case 5 : args->dry_run = 1; break; - case 6 : + case 6 : args->distinctive_sites = strtod(optarg,&tmp); if ( *tmp ) { @@ -1202,7 +1209,7 @@ int main_vcfgtcheck(int argc, char *argv[]) else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4; else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); break; - case 'S': + case 'S': if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1; else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1; else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index e0a70ba..54568b0 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -2,7 +2,7 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -61,6 +61,7 @@ typedef struct int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; int regions_overlap, targets_overlap; int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; + int nused[2][2]; double *pdiff, *qry_prob, *gt_prob; uint32_t *ndiff,*ncnt,ncmp, npairs; int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr; @@ -311,7 +312,7 @@ static void init_data(args_t *args) init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); } if ( args->gt_samples ) - { + { init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, args->gt_hdr ? args->gt_hdr : args->qry_hdr, args->gt_fname ? args->gt_fname : args->qry_fname); @@ -379,7 +380,7 @@ static void init_data(args_t *args) args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob)); // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing - // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding + // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding // probabilities of 0/0, 0/1, and 1/1 genotypes for (i=0; i<8; i++) for (j=0; j<3; j++) @@ -557,7 +558,9 @@ static void process_line(args_t *args) args->gt_arr = args->qry_arr; } + // stats: number of compared sites, and used tags args->ncmp++; + args->nused[qry_use_GT][gt_use_GT]++; double af,hwe_dsg[8]; if ( args->calc_hwe_prob ) @@ -638,7 +641,7 @@ static void process_line(args_t *args) gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob); if ( !gt_dsg ) continue; // missing value if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom - + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; qry_dsg = qry_use_GT ? 
gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob); if ( !qry_dsg ) continue; // missing value @@ -799,11 +802,15 @@ static void report(args_t *args) fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data); fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT); fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL); + fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]); + fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]); + fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]); + fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]); fprintf(args->fp,"# DC, discordance:\n"); fprintf(args->fp,"# - query sample\n"); fprintf(args->fp,"# - genotyped sample\n"); - fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n"); - fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n"); + fprintf(args->fp,"# - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n"); + fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n"); fprintf(args->fp,"# - number of sites compared (bigger is better)\n"); fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n"); @@ -1025,7 +1032,7 @@ static int is_input_okay(args_t *args, int nmatch) return 1; not_okay: - fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", + fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. 
(This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1,msg); return 0; } @@ -1099,7 +1106,7 @@ int main_vcfgtcheck(int argc, char *argv[]) args->es_max_mem = strdup("500M"); // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 - // - min_inter: pairs with smaller err value will be considered identical + // - min_inter: pairs with smaller err value will be considered identical // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered // different. If negative, the cutoff may be heuristically lowered args->min_inter_err = 0.23; @@ -1171,7 +1178,7 @@ int main_vcfgtcheck(int argc, char *argv[]) case 3 : args->calc_hwe_prob = 0; break; case 4 : error("The option -S, --target-sample has been deprecated\n"); break; case 5 : args->dry_run = 1; break; - case 6 : + case 6 : args->distinctive_sites = strtod(optarg,&tmp); if ( *tmp ) { @@ -1204,7 +1211,7 @@ int main_vcfgtcheck(int argc, char *argv[]) else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4; else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); break; - case 'S': + case 'S': if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1; else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1; else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index a755a85..4ee29b4 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -60,6 +60,8 @@ typedef struct char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list; char *isec_exact; int argc, record_cmd_line; + char *index_fn; + int write_index; } args_t; @@ -148,6 +150,8 @@ void isec_vcf(args_t *args) if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 ) + error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output"); } if ( !args->nwrite && !out_std && !args->prefix ) fprintf(stderr,"Note: -w option not given, printing list of sites...\n"); @@ -253,7 +257,19 @@ void isec_vcf(args_t *args) } } if ( str.s ) free(str.s); - if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-"); + if ( out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); + } } static void add_filter(args_t *args, char *expr, int logic) @@ -481,6 +497,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -537,6 +554,7 @@ int main_vcfisec(int argc, char *argv[]) {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -608,6 +626,7 @@ int main_vcfisec(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index 50214a6..76e4d3a 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -2,7 +2,7 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -62,6 +62,8 @@ typedef struct char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list; char *isec_exact; int argc, record_cmd_line; + char *index_fn; + int write_index; } args_t; @@ -150,6 +152,8 @@ void isec_vcf(args_t *args) if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 ) + error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output"); } if ( !args->nwrite && !out_std && !args->prefix ) fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n"); @@ -255,7 +259,19 @@ void isec_vcf(args_t *args) } } if ( str.s ) free(str.s); - if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-"); + if ( out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); + } } static void add_filter(args_t *args, char *expr, int logic) @@ -483,6 +499,7 @@ static void usage(void) fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -539,6 +556,7 @@ int main_vcfisec(int argc, char *argv[]) {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -610,6 +628,7 @@ int main_vcfisec(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 621f410..87b6b8a 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -63,6 +63,19 @@ typedef khash_t(strdict) strdict_t; #define PL2PROB_MAX 1024 +// Rules for merging FORMAT Number=A,G,R vectors with missing values +#define MERGE_MISSING_DOT 0 // leave as is, i.e. use a missing value "." 
+#define MERGE_MISSING_CONST 1 // use a constant value +#define MERGE_MISSING_MAX 2 // use the existing maximum value + +typedef struct _missing_rule_t +{ + char *hdr_tag; + int type; + float value; +} +missing_rule_t; + // For merging INFO Number=A,G,R tags typedef struct { @@ -103,29 +116,37 @@ typedef struct int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles) int mmap; // size of map array (only buffer[i].n_allele is actually used) int als_differ; + int var_types; // variant types in this record, shifted by <<1 to account for VCF_REF } maux1_t; + +// Buffered lines for a single reader typedef struct { int rid; // current rid int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush. + int unkn_allele;// the index of the unknown allele (<*>, ) int cur; // current line or -1 if none int mrec; // allocated size of buf maux1_t *rec; // buffer to keep reader's lines bcf1_t **lines; // source buffer: either gvcf or readers' buffer + int var_types; // reader's variant types in the active [beg,end] window } buffer_t; typedef struct { - int n, pos, var_types; // number of readers, current position, currently available variant types + int n, pos, var_types; // number of readers; current position; variant types at this position across all available records + int *als_types, // allele type of each output allele + mals_types; char *chr; // current chromosome char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output int nals, mals, nout_als, mout_als; // size of the output array int *cnt, ncnt; // number of records that refer to the alleles int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases) + const char **fmt_key;// temporary short-lived array to store output tag names bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to 
i*nreader+j, first row is reserved for GT int nfmt_map; // number of rows in the fmt_map array - int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes + int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes, from src idxs to dst file idxs void *tmp_arr; size_t ntmp_arr; buffer_t *buf; @@ -156,6 +177,9 @@ typedef struct faidx_t *gvcf_fai; info_rule_t *rules; int nrules; + char *missing_rules_str; + missing_rule_t *missing_rules; // lookup for -M, --missing-rules + int nmissing_rules; strdict_t *tmph; kstring_t tmps; bcf_srs_t *files; @@ -166,6 +190,8 @@ typedef struct int argc, n_threads, record_cmd_line, clevel; int local_alleles; // the value of -L option int keep_AC_AN; + char *index_fn; + int write_index; } args_t; @@ -298,6 +324,89 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul } } +static int missing_rules_comp_key2(const void *a, const void *b) +{ + missing_rule_t *rule1 = (missing_rule_t*) a; + missing_rule_t *rule2 = (missing_rule_t*) b; + return strcmp(rule1->hdr_tag, rule2->hdr_tag); +} +static int missing_rules_comp_key(const void *a, const void *b) +{ + char *key = (char*) a; + missing_rule_t *rule = (missing_rule_t*) b; + return strcmp(key, rule->hdr_tag); +} +static void missing_rules_init(args_t *args) +{ + kstring_t str = {0,0,0}; + if ( args->missing_rules_str ) + { + if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str); + else kputs(args->missing_rules_str,&str); + } + else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str); + else return; + + args->nmissing_rules = 1; + char *ss = str.s, *tmp = ss; + int n = 0; + while ( *ss ) + { + if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); } + else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); } + ss++; + } + if ( n%2==0 ) error("Could 
not parse --missing-rules: \"%s\"\n", args->missing_rules_str); + args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t)); + + n = args->nmissing_rules; + args->nmissing_rules = 0; + ss = tmp; + while ( args->nmissing_rules < n ) + { + missing_rule_t *rule = &args->missing_rules[args->nmissing_rules]; + rule->hdr_tag = strdup(ss); + int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); + if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) ) + { + if ( args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", rule->hdr_tag); + free(rule->hdr_tag); + n--; + ss = strchr(ss, '\0'); ss++; + if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag); + ss = strchr(ss, '\0'); ss++; + continue; + } + + ss = strchr(ss, '\0'); ss++; + if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag); + + if ( !strcasecmp(ss,".") ) rule->type = MERGE_MISSING_DOT; + else if ( !strcasecmp(ss,"max") ) rule->type = MERGE_MISSING_MAX; + else + { + char *tmp = ss; + rule->value = strtod(ss, &tmp); + if ( *tmp ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); + rule->type = MERGE_MISSING_CONST; + } + ss = strchr(ss, '\0'); ss++; + args->nmissing_rules++; + } + qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2); + free(str.s); +} +static void missing_rules_destroy(args_t *args) +{ + int i; + for (i=0; inmissing_rules; i++) + { + missing_rule_t *rule = &args->missing_rules[i]; + free(rule->hdr_tag); + } + free(args->missing_rules); +} + static int info_rules_comp_key2(const void *a, const void *b) { info_rule_t *rule1 = (info_rule_t*) a; @@ -770,6 +879,7 @@ void maux_destroy(maux_t *ma) int i,j; for (i=0; inout_smpl; i++) free(ma->str[i].s); free(ma->str); + free(ma->als_types); for (i=0; imals; i++) { free(ma->als[i]); @@ -793,6 +903,7 @@ void 
maux_destroy(maux_t *ma) free(ma->AGR_info); if (ma->ntmp_arr) free(ma->tmp_arr); if (ma->nfmt_map) free(ma->fmt_map); + free(ma->fmt_key); // ma->inf freed in bcf_destroy1 for (i=0; imals; i++) free(ma->als[i]); if (ma->mout_als) free(ma->out_als); @@ -820,7 +931,6 @@ void maux_reset(maux_t *ma, int *rid_tab) { int i,j; for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); - for (i=0; incnt; i++) ma->cnt[i] = 0; for (i=0; imals; i++) { free(ma->als[i]); @@ -856,6 +966,7 @@ void maux_reset(maux_t *ma, int *rid_tab) for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) { ma->buf[i].rec[j].skip = 0; + ma->buf[i].rec[j].var_types = 0; bcf1_t *line = ma->files->readers[i].buffer[j]; if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break; } @@ -959,12 +1070,14 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int ir, j; for (ir=0; irnreaders; ir++) { + ma->buf[ir].unkn_allele = 0; bcf1_t *line = maux_get_line(args,ir); if ( !line ) continue; for (j=1; jn_allele; j++) { int irec = ma->buf[ir].cur; if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als; + if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j; } } } @@ -1985,7 +2098,7 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); ma->laa_dirty = 1; } -void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) +void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out) { bcf_srs_t *files = args->files; bcf_hdr_t *out_hdr = args->out_hdr; @@ -2135,12 +2248,32 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) for (l=1; lsmpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \ - for (l=0; lsmpl_ploidy[ismpl+j]==1 ? 1 : 0; \ + int ngsize = haploid ? 
out->n_allele : out->n_allele*(out->n_allele + 1)/2; \ + if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \ + for (l=0; ltype==MERGE_MISSING_CONST ) \ + { \ + for (l=0; lvalue; tgt++; } \ + } \ + else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + src_type_t max = src[0]; \ + for (l=1; ln; l++) if ( max < src[l] ) max = src[l]; \ + for (l=0; lsmpl_ploidy[ismpl+j]==1 ) \ + if ( haploid ) \ { \ - /* Haploid */ \ int iori, inew; \ for (iori=0; iorin_allele; iori++) \ { \ @@ -2194,7 +2327,26 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) continue; \ } \ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \ - for (l=0; lbuf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ + { \ + int iunkn = ma->buf[i].unkn_allele; \ + for (l=0; ltype==MERGE_MISSING_CONST ) \ + { \ + for (l=0; lvalue; tgt++; } \ + } \ + else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + src_type_t max = src[0]; \ + for (l=1; ln; l++) if ( max < src[l] ) max = src[l]; \ + for (l=0; ln_allele; iori++) \ { \ @@ -2234,6 +2386,7 @@ void merge_format(args_t *args, bcf1_t *out) { ma->nfmt_map = 2; ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*)); + ma->fmt_key = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key)); } else memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**)); @@ -2250,7 +2403,7 @@ void merge_format(args_t *args, bcf1_t *out) bcf_hdr_t *hdr = reader->header; for (j=0; jn_fmt; j++) { - // Wat this tag already seen? + // Was this tag already seen? 
bcf_fmt_t *fmt = &line->d.fmt[j]; const char *key = hdr->id[BCF_DT_ID][fmt->id].key; kitr = kh_get(strdict, tmph, key); @@ -2269,9 +2422,11 @@ void merge_format(args_t *args, bcf1_t *out) { ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders); memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*)); + ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1)); ma->nfmt_map = max_ifmt+1; } if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; } + ma->fmt_key[max_ifmt] = key; } kitr = kh_put(strdict, tmph, key, &ret); kh_value(tmph, kitr) = ifmt; @@ -2298,7 +2453,10 @@ void merge_format(args_t *args, bcf1_t *out) update_AN_AC(out_hdr, out); for (i=1; i<=max_ifmt; i++) - merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); + { + missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key); + merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out); + } if ( ma->laa_dirty ) update_local_alleles(args, out); @@ -2406,6 +2564,9 @@ void gvcf_write_block(args_t *args, int start, int end) { int slen = 0; char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen); + if (!seq) + exit(1); // faidx_fetch_seq has already reported the error. + if (slen) { out->d.allele[0][0] = seq[0]; @@ -2520,16 +2681,6 @@ static inline int is_gvcf_block(bcf1_t *line) return 0; } -// Lines can come with any combination of variant types. 
We use a subset of types defined in vcf.h -// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and -// to accommodate for VCF_GVCF_REF defined below -static const int - snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), - indel_mask = VCF_INDEL<<2, - ins_mask = VCF_INS<<2, - del_mask = VCF_DEL<<2, - ref_mask = 2; - /* Check incoming lines for new gVCF blocks, set pointer to the current source buffer (gvcf or readers). In contrast to gvcf_flush, this function can be @@ -2629,7 +2780,7 @@ void clean_buffer(args_t *args) { if ( ma->gvcf[ir].active ) { - if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; + if ( ma->pos > ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block } if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; @@ -2664,13 +2815,16 @@ void debug_maux(args_t *args) { bcf_sr_t *reader = &files->readers[j]; buffer_t *buf = &maux->buf[j]; - fprintf(stderr," reader %d: ", j); + fprintf(stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end); for (k=buf->beg; kend; k++) { - if ( buf->rec[k].skip & SKIP_DONE ) continue; - bcf1_t *line = reader->buffer[k]; + if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(stderr," DONE"); continue; } + bcf1_t *line = reader->buffer[k]; // selected for merging by can_merge fprintf(stderr,"\t"); - if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record will not be merged in this round + if ( buf->cur==k ) fprintf(stderr,"!"); // selected for merging by stage_line + if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record cannot be merged in this round + if ( !line->n_allele && maux->gvcf[j].active ) + fprintf(stderr,"<*>"); for (l=0; ln_allele; l++) fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]); if ( buf->rec[k].skip ) fprintf(stderr,"]"); @@ -2686,9 +2840,10 @@ void debug_state(args_t *args) { maux_t *maux = args->maux; int i,j; + fprintf(stderr,"State after position=%d 
done:\n",maux->pos+1); for (i=0; ifiles->nreaders; i++) { - fprintf(stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); + fprintf(stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); if ( maux->buf[i].cur >=0 ) { bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); @@ -2698,20 +2853,136 @@ void debug_state(args_t *args) } fprintf(stderr,"\n"); } - fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min); + fprintf(stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min); for (i=0; ifiles->nreaders; i++) { - fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); + fprintf(stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); fprintf(stderr,"\n"); } fprintf(stderr,"\n"); } + +// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h +// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) +static const int + snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), + indel_mask = (VCF_INDEL<<1), + ins_mask = VCF_INS<<1, + del_mask = VCF_DEL<<1, + ref_mask = 1; + +// Can these types be merged given the -m settings? 
Despite the function's name, its focus is on +// excluding incompatible records, there will be a finer matching later in stage_line() +static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec) +{ + int k; + maux_t *maux = args->maux; + bcf1_t *rec = buf->lines[irec]; + int rec_types = buf->rec[irec].var_types; + + assert( selected_types ); // this is trivially true, set in can_merge() + + if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything + + // REF and gVCF_REF with no other alleles present can be merged with anything + if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1; + if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1; + + if ( args->collapse!=COLLAPSE_NONE ) + { + // If we are here, one the following modes must have been set: both,snps,indels,snp-ins-del + // Include the new record if + // - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del + // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del + if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) ) + { + if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1; + } + if ( args->collapse&COLLAPSE_INDELS ) + { + if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1; + } + if ( args->collapse&COLLAPSE_SNP_INS_DEL ) + { + if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1; + if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1; + } + // Whatever is left, allow to match if the alleles match exactly + } + + // The -m none mode or exact matching requested + // Simple test first: are the variants of the same type? 
+ int x = selected_types >> 1; // remove REF + int y = rec_types >> 1; // remove REF + while ( x && y ) { x>>=1; y>>=1; } + if ( x || y ) return 0; // the types differ + + if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible + for (k=1; kn_allele; k++) + { + if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or ) + if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break; + } + if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k] + return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF +} + +static void maux_update_alleles(args_t *args, int ireader, int irec) +{ + int k; + bcf_sr_t *reader = &args->files->readers[ireader]; + maux_t *maux = args->maux; + buffer_t *buf = &maux->buf[ireader]; + maux1_t *ma1 = &buf->rec[irec]; + bcf1_t *line = buf->lines[irec]; + hts_expand(int, line->n_allele, ma1->mmap, ma1->map); + if ( !maux->nals ) // first record to be merged, copy the alleles to the output + { + maux->nals = line->n_allele; + hts_expand0(char*, maux->nals, maux->mals, maux->als); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + hts_expand0(int, maux->nals, maux->mals_types, maux->als_types); + for (k=0; knals; k++) + { + free(maux->als[k]); + maux->als[k] = strdup(line->d.allele[k]); + ma1->map[k] = k; + maux->cnt[k] = 1; + int var_type = bcf_has_variant_type(line, k, VCF_ANY); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL; + maux->als_types[k] = var_type ? 
var_type<<1 : ref_mask; + } + return; + } + // normalize alleles + maux->als = merge_alleles(line->d.allele, line->n_allele, ma1->map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + hts_expand0(int, maux->nals, maux->mals_types, maux->als_types); + for (k=1; kn_allele; k++) + { + int ik = ma1->map[k]; + int var_type = bcf_has_variant_type(line, k, VCF_ANY); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL; + maux->als_types[ik] = var_type ? var_type<<1 : ref_mask; + maux->cnt[ik]++; // how many times an allele appears in the files + } + maux->cnt[0]++; +} + /* - Determine which line should be merged from which reader: go through all - readers and all buffered lines, expand REF,ALT and try to match lines with - the same ALTs. + Determine which lines remain to be merged across readers at the current position and + are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible, + SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time. + + At the same time count how many times is each allele present across the readers and records + so that we can prioritize the records with the same alleles to come first. In the end maximum + one record at a time can be selected from each reader and that witll be done in stage_line(). + + The function maux_reset already initialized structures for this position, so here each + reader comes with the beg,end indexes that point to records with the same maux_t.pos position. 
*/ int can_merge(args_t *args) { @@ -2719,28 +2990,39 @@ int can_merge(args_t *args) maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; - int i,j,k, ntodo = 0; + int i,j, ntodo = 0; for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; + maux->cnt[i] = 0; } maux->var_types = maux->nals = 0; - // this is only for the `-m none -g` mode, ensure that <*> lines come last - #define VCF_GVCF_REF 1 - + // In this loop we do the following: + // - remember the first encountered ID if matching by ID + // - count the number of unprocessed records at this position + // - collect all variant types at this position. This is to be able to perform -m matching and + // print SNVs first, then indels, then gVCF blocks + // - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; + buf->var_types = 0; - if ( gaux && gaux[i].active ) + if ( gaux && gaux[i].active ) // active gvcf block { - // skip readers with active gvcf blocks buf->rec[buf->beg].skip = SKIP_DIFF; + maux->var_types |= ref_mask; + buf->var_types |= ref_mask; + buf->rec[buf->beg].var_types = ref_mask; continue; } + + // for gvcf: find out REF at this position + if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0]; + for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip & SKIP_DONE ) continue; @@ -2749,118 +3031,70 @@ int can_merge(args_t *args) ntodo++; bcf1_t *line = buf->lines[j]; - if ( args->merge_by_id ) - id = line->d.id; - else + if ( args->merge_by_id && !id ) { id = line->d.id; continue; } // set ID when merging by id + + if ( !buf->rec[j].var_types ) { int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); - if (var_type < 0) error("bcf_has_variant_types() failed."); + if ( var_type < 0 ) error("bcf_has_variant_types() failed."); if ( args->collapse==COLLAPSE_SNP_INS_DEL ) { // need to distinguish between ins and del so strip the VCF_INDEL 
flag var_type &= ~VCF_INDEL; } - maux->var_types |= var_type ? var_type<<2 : 2; - - // for the `-m none -g` mode - if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) ) - maux->var_types |= VCF_GVCF_REF; + var_type = var_type ? var_type<<1 : ref_mask; + if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask; + buf->rec[j].var_types = var_type; } + maux->var_types |= buf->rec[j].var_types; + buf->var_types |= buf->rec[j].var_types; } - - // for gvcf: find out REF at this position - if ( buf->beg < buf->end && ref=='N' ) - ref = buf->lines[buf->beg]->d.allele[0][0]; } if ( !ntodo ) return 0; + int selected_types = 0; + // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). Go through all files and all lines at this // position and normalize relevant alleles. // REF-only sites may be associated with both SNPs and indels. for (i=0; inreaders; i++) { - bcf_sr_t *reader = &files->readers[i]; buffer_t *buf = &maux->buf[i]; - if ( gaux && gaux[i].active ) { + // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position gaux[i].line->d.allele[0][0] = ref; gaux[i].line->pos = maux->pos; + maux_update_alleles(args, i, buf->beg); + selected_types |= ref_mask; + continue; } - for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip & SKIP_DONE ) continue; bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer - - int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); - if (line_type < 0) error("bcf_has_variant_types() failed."); - line_type = line_type ? 
line_type<<2 : 2; + int line_types = buf->rec[j].var_types; // select relevant lines if ( args->merge_by_id ) { - if ( strcmp(id,line->d.id) ) continue; + if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record } + else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue; else { - // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant - // records come last, otherwise infinite loop is created (#1164) - if ( args->collapse==COLLAPSE_NONE && args->do_gvcf ) - { - if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue; - } - if ( args->collapse==COLLAPSE_NONE && maux->nals ) - { - // All alleles of the tested record must be present in the - // selected maux record plus variant types must be the same - if ( (maux->var_types & line_type) != line_type ) continue; - if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible - for (k=1; kn_allele; k++) - { - if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break; - } - if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele - } - if ( !(args->collapse&COLLAPSE_ANY) ) - { - // Merge: - // - SNPs+SNPs+MNPs+REF if -m both,snps - // - indels+indels+REF if -m both,indels, REF only if SNPs are not present - // - SNPs come first - if ( line_type & (indel_mask|ins_mask|del_mask) ) - { - if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first - if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks - } - } + // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes + if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics + && (maux->var_types&snp_mask) // there are SNVs at the current position + && 
!(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref + ) continue; } - buf->rec[j].skip = 0; + selected_types |= line_types; - hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map); - if ( !maux->nals ) // first record, copy the alleles to the output - { - maux->nals = line->n_allele; - hts_expand0(char*, maux->nals, maux->mals, maux->als); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=0; knals; k++) - { - free(maux->als[k]); - maux->als[k] = strdup(line->d.allele[k]); - buf->rec[j].map[k] = k; - maux->cnt[k] = 1; - } - continue; - } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); - if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=1; kn_allele; k++) - maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files - maux->cnt[0]++; + buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line + maux_update_alleles(args, i, j); } } return 1; @@ -2878,48 +3112,61 @@ void stage_line(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; - // debug_maux(args); - - // take the most frequent allele present in multiple files, REF is skipped - int i,j,k,icnt = 1; - for (i=2; inals; i++) - if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; + // Take the most frequent allele present in multiple files, REF and gVCF_REF is skipped. + int i,j,k,icnt = -1; + for (i=1; inals; i++) + { + if ( maux->als_types[i] & ref_mask ) continue; + if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i; + } + int selected_type = icnt>0 ? 
maux->als_types[icnt] : ref_mask; int nout = 0; for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; buf->cur = -1; - if ( buf->beg >= buf->end ) continue; // no lines in the buffer + if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site // find lines with the same allele for (j=buf->beg; jend; j++) { - if ( buf->rec[j].skip ) continue; // done or not compatible - if ( args->merge_by_id ) break; - if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record + if ( buf->rec[j].skip ) + { + int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0; + if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1; + if ( !is_gvcf ) continue; // done or not compatible + } + if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line + + // skip if the reader has a record that matches the most frequent allele and this record is not it + if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue; + // if the reader does not have the most frequent allele type but is a ref, accept + if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break; + if ( selected_type==ref_mask ) break; + + // accept if the record has the most frequent allele for (k=0; klines[j]->n_allele; k++) if ( icnt==buf->rec[j].map[k] ) break; - if ( klines[j]->n_allele ) break; } if ( j>=buf->end ) { // no matching allele found in this file - if ( args->collapse==COLLAPSE_NONE ) continue; + if ( args->collapse==COLLAPSE_NONE ) continue; // exact matching requested, skip + // choose something compatible to create a multiallelic site given the -m criteria for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip ) continue; // done or not compatible if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged - int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap); - if (line_type < 0) error("bcf_has_variant_types() failed."); - if ( 
maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; - if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; - if ( line_type==VCF_REF ) + int line_type = buf->rec[j].var_types; + if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( line_type&ref_mask ) { if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; @@ -2940,12 +3187,21 @@ void stage_line(args_t *args) { // found a suitable line for merging buf->cur = j; - - // mark as finished so that it's ignored next time - buf->rec[j].skip = SKIP_DONE; - nout++; } } + + // debug_maux(args); + + // Mark lines staged for merging as finished so that they are ignored next time + for (i=0; inreaders; i++) + { + buffer_t *buf = &maux->buf[i]; + if ( buf->cur == -1 ) continue; + + buf->rec[buf->cur].skip = SKIP_DONE; + nout++; + } + assert( nout ); } @@ -3078,6 +3334,7 @@ void merge_vcf(args_t *args) error_errno("[%s] Failed to update header", __func__); } info_rules_init(args); + missing_rules_init(args); bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -3087,6 +3344,7 @@ void merge_vcf(args_t *args) if ( 
hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); return; } + else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init(); args->maux = maux_init(args); @@ -3122,9 +3380,19 @@ void merge_vcf(args_t *args) gvcf_flush(args,1); info_rules_destroy(args); + missing_rules_destroy(args); maux_destroy(args->maux); bcf_hdr_destroy(args->out_hdr); - if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"stdout"); bcf_destroy1(args->out_line); kh_destroy(strdict, args->tmph); if ( args->tmps.m ) free(args->tmps.s); @@ -3146,11 +3414,12 @@ static void usage(void) fprintf(stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n"); fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); fprintf(stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); - fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); + fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n"); fprintf(stderr, " -i, --info-rules TAG:METHOD,.. 
Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(stderr, " -l, --file-list FILE Read file names from the file\n"); fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); fprintf(stderr, " -m, --merge STRING Allow multiallelic records for , see man page for details [both]\n"); + fprintf(stderr, " -M, --missing-rules TAG:METHOD Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n"); fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -3159,6 +3428,7 @@ static void usage(void) fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -3197,13 +3467,15 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,4}, {"info-rules",required_argument,NULL,'i'}, + {"missing-rules",required_argument,NULL,'M'}, {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, + {"write-index",no_argument,NULL,11}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) { switch (c) { 
case 'L': args->local_alleles = strtol(optarg,&tmp,10); @@ -3227,6 +3499,7 @@ int main_vcfmerge(int argc, char *argv[]) break; case 'l': args->file_list = optarg; break; case 'i': args->info_rules = optarg; break; + case 'M': args->missing_rules_str = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -3254,7 +3527,7 @@ int main_vcfmerge(int argc, char *argv[]) else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE; - else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL; + else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS; else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; } else error("The -m type \"%s\" is not recognised.\n", optarg); break; @@ -3271,6 +3544,7 @@ int main_vcfmerge(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->no_index = 1; break; + case 11 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 2231a57..7ce5dfa 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -2,7 +2,7 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -65,6 +65,19 @@ typedef khash_t(strdict) strdict_t; #define PL2PROB_MAX 1024 +// Rules for merging FORMAT Number=A,G,R vectors with missing values +#define MERGE_MISSING_DOT 0 // leave as is, i.e. use a missing value "." 
+#define MERGE_MISSING_CONST 1 // use a constant value +#define MERGE_MISSING_MAX 2 // use the existing maximum value + +typedef struct _missing_rule_t +{ + char *hdr_tag; + int type; + float value; +} +missing_rule_t; + // For merging INFO Number=A,G,R tags typedef struct { @@ -105,29 +118,37 @@ typedef struct int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles) int mmap; // size of map array (only buffer[i].n_allele is actually used) int als_differ; + int var_types; // variant types in this record, shifted by <<1 to account for VCF_REF } maux1_t; + +// Buffered lines for a single reader typedef struct { int rid; // current rid int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush. + int unkn_allele;// the index of the unknown allele (<*>, ) int cur; // current line or -1 if none int mrec; // allocated size of buf maux1_t *rec; // buffer to keep reader's lines bcf1_t **lines; // source buffer: either gvcf or readers' buffer + int var_types; // reader's variant types in the active [beg,end] window } buffer_t; typedef struct { - int n, pos, var_types; // number of readers, current position, currently available variant types + int n, pos, var_types; // number of readers; current position; variant types at this position across all available records + int *als_types, // allele type of each output allele + mals_types; char *chr; // current chromosome char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output int nals, mals, nout_als, mout_als; // size of the output array int *cnt, ncnt; // number of records that refer to the alleles int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases) + const char **fmt_key;// temporary short-lived array to store output tag names bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to 
i*nreader+j, first row is reserved for GT int nfmt_map; // number of rows in the fmt_map array - int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes + int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes, from src idxs to dst file idxs void *tmp_arr; size_t ntmp_arr; buffer_t *buf; @@ -158,6 +179,9 @@ typedef struct faidx_t *gvcf_fai; info_rule_t *rules; int nrules; + char *missing_rules_str; + missing_rule_t *missing_rules; // lookup for -M, --missing-rules + int nmissing_rules; strdict_t *tmph; kstring_t tmps; bcf_srs_t *files; @@ -168,6 +192,8 @@ typedef struct int argc, n_threads, record_cmd_line, clevel; int local_alleles; // the value of -L option int keep_AC_AN; + char *index_fn; + int write_index; } args_t; @@ -300,6 +326,89 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul } } +static int missing_rules_comp_key2(const void *a, const void *b) +{ + missing_rule_t *rule1 = (missing_rule_t*) a; + missing_rule_t *rule2 = (missing_rule_t*) b; + return strcmp(rule1->hdr_tag, rule2->hdr_tag); +} +static int missing_rules_comp_key(const void *a, const void *b) +{ + char *key = (char*) a; + missing_rule_t *rule = (missing_rule_t*) b; + return strcmp(key, rule->hdr_tag); +} +static void missing_rules_init(args_t *args) +{ + kstring_t str = {0,0,0}; + if ( args->missing_rules_str ) + { + if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str); + else kputs(args->missing_rules_str,&str); + } + else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str); + else return; + + args->nmissing_rules = 1; + char *ss = str.s, *tmp = ss; + int n = 0; + while ( *ss ) + { + if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); } + else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); } + ss++; + } + if ( n%2==0 ) error("Could 
not parse --missing-rules: \"%s\"\n", args->missing_rules_str); + args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t)); + + n = args->nmissing_rules; + args->nmissing_rules = 0; + ss = tmp; + while ( args->nmissing_rules < n ) + { + missing_rule_t *rule = &args->missing_rules[args->nmissing_rules]; + rule->hdr_tag = strdup(ss); + int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); + if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) ) + { + if ( args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", rule->hdr_tag); + free(rule->hdr_tag); + n--; + ss = strchr(ss, '\0'); ss++; + if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag); + ss = strchr(ss, '\0'); ss++; + continue; + } + + ss = strchr(ss, '\0'); ss++; + if ( !*ss ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", rule->hdr_tag); + + if ( !strcasecmp(ss,".") ) rule->type = MERGE_MISSING_DOT; + else if ( !strcasecmp(ss,"max") ) rule->type = MERGE_MISSING_MAX; + else + { + char *tmp = ss; + rule->value = strtod(ss, &tmp); + if ( *tmp ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); + rule->type = MERGE_MISSING_CONST; + } + ss = strchr(ss, '\0'); ss++; + args->nmissing_rules++; + } + qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2); + free(str.s); +} +static void missing_rules_destroy(args_t *args) +{ + int i; + for (i=0; inmissing_rules; i++) + { + missing_rule_t *rule = &args->missing_rules[i]; + free(rule->hdr_tag); + } + free(args->missing_rules); +} + static int info_rules_comp_key2(const void *a, const void *b) { info_rule_t *rule1 = (info_rule_t*) a; @@ -772,6 +881,7 @@ void maux_destroy(maux_t *ma) int i,j; for (i=0; inout_smpl; i++) free(ma->str[i].s); free(ma->str); + free(ma->als_types); for (i=0; imals; i++) { free(ma->als[i]); @@ -795,6 +905,7 @@ void 
maux_destroy(maux_t *ma) free(ma->AGR_info); if (ma->ntmp_arr) free(ma->tmp_arr); if (ma->nfmt_map) free(ma->fmt_map); + free(ma->fmt_key); // ma->inf freed in bcf_destroy1 for (i=0; imals; i++) free(ma->als[i]); if (ma->mout_als) free(ma->out_als); @@ -822,7 +933,6 @@ void maux_reset(maux_t *ma, int *rid_tab) { int i,j; for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); - for (i=0; incnt; i++) ma->cnt[i] = 0; for (i=0; imals; i++) { free(ma->als[i]); @@ -858,6 +968,7 @@ void maux_reset(maux_t *ma, int *rid_tab) for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) { ma->buf[i].rec[j].skip = 0; + ma->buf[i].rec[j].var_types = 0; bcf1_t *line = ma->files->readers[i].buffer[j]; if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break; } @@ -961,12 +1072,14 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int ir, j; for (ir=0; irnreaders; ir++) { + ma->buf[ir].unkn_allele = 0; bcf1_t *line = maux_get_line(args,ir); if ( !line ) continue; for (j=1; jn_allele; j++) { int irec = ma->buf[ir].cur; if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als; + if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j; } } } @@ -1987,7 +2100,7 @@ void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bc bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); ma->laa_dirty = 1; } -void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) +void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out) { bcf_srs_t *files = args->files; bcf_hdr_t *out_hdr = args->out_hdr; @@ -2137,12 +2250,32 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) for (l=1; lsmpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \ - for (l=0; lsmpl_ploidy[ismpl+j]==1 ? 1 : 0; \ + int ngsize = haploid ? 
out->n_allele : out->n_allele*(out->n_allele + 1)/2; \ + if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \ + for (l=0; ltype==MERGE_MISSING_CONST ) \ + { \ + for (l=0; lvalue; tgt++; } \ + } \ + else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + src_type_t max = src[0]; \ + for (l=1; ln; l++) if ( max < src[l] ) max = src[l]; \ + for (l=0; lsmpl_ploidy[ismpl+j]==1 ) \ + if ( haploid ) \ { \ - /* Haploid */ \ int iori, inew; \ for (iori=0; iorin_allele; iori++) \ { \ @@ -2196,7 +2329,26 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) continue; \ } \ src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \ - for (l=0; lbuf[i].unkn_allele ) /* Use value from the unknown allele when available */ \ + { \ + int iunkn = ma->buf[i].unkn_allele; \ + for (l=0; ltype==MERGE_MISSING_CONST ) \ + { \ + for (l=0; lvalue; tgt++; } \ + } \ + else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \ + { \ + src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \ + src_type_t max = src[0]; \ + for (l=1; ln; l++) if ( max < src[l] ) max = src[l]; \ + for (l=0; ln_allele; iori++) \ { \ @@ -2236,6 +2388,7 @@ void merge_format(args_t *args, bcf1_t *out) { ma->nfmt_map = 2; ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*)); + ma->fmt_key = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key)); } else memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**)); @@ -2252,7 +2405,7 @@ void merge_format(args_t *args, bcf1_t *out) bcf_hdr_t *hdr = reader->header; for (j=0; jn_fmt; j++) { - // Wat this tag already seen? + // Was this tag already seen? 
bcf_fmt_t *fmt = &line->d.fmt[j]; const char *key = hdr->id[BCF_DT_ID][fmt->id].key; kitr = kh_get(strdict, tmph, key); @@ -2271,9 +2424,11 @@ void merge_format(args_t *args, bcf1_t *out) { ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders); memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*)); + ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1)); ma->nfmt_map = max_ifmt+1; } if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; } + ma->fmt_key[max_ifmt] = key; } kitr = kh_put(strdict, tmph, key, &ret); kh_value(tmph, kitr) = ifmt; @@ -2300,7 +2455,10 @@ void merge_format(args_t *args, bcf1_t *out) update_AN_AC(out_hdr, out); for (i=1; i<=max_ifmt; i++) - merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); + { + missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key); + merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out); + } if ( ma->laa_dirty ) update_local_alleles(args, out); @@ -2408,6 +2566,9 @@ void gvcf_write_block(args_t *args, int start, int end) { int slen = 0; char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen); + if (!seq) + bcftools_exit(1); // faidx_fetch_seq has already reported the error. + if (slen) { out->d.allele[0][0] = seq[0]; @@ -2522,16 +2683,6 @@ static inline int is_gvcf_block(bcf1_t *line) return 0; } -// Lines can come with any combination of variant types. 
We use a subset of types defined in vcf.h -// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and -// to accommodate for VCF_GVCF_REF defined below -static const int - snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), - indel_mask = VCF_INDEL<<2, - ins_mask = VCF_INS<<2, - del_mask = VCF_DEL<<2, - ref_mask = 2; - /* Check incoming lines for new gVCF blocks, set pointer to the current source buffer (gvcf or readers). In contrast to gvcf_flush, this function can be @@ -2631,7 +2782,7 @@ void clean_buffer(args_t *args) { if ( ma->gvcf[ir].active ) { - if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; + if ( ma->pos > ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block } if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; @@ -2666,13 +2817,16 @@ void debug_maux(args_t *args) { bcf_sr_t *reader = &files->readers[j]; buffer_t *buf = &maux->buf[j]; - fprintf(bcftools_stderr," reader %d: ", j); + fprintf(bcftools_stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end); for (k=buf->beg; kend; k++) { - if ( buf->rec[k].skip & SKIP_DONE ) continue; - bcf1_t *line = reader->buffer[k]; + if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(bcftools_stderr," DONE"); continue; } + bcf1_t *line = reader->buffer[k]; // selected for merging by can_merge fprintf(bcftools_stderr,"\t"); - if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"["); // this record will not be merged in this round + if ( buf->cur==k ) fprintf(bcftools_stderr,"!"); // selected for merging by stage_line + if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"["); // this record cannot be merged in this round + if ( !line->n_allele && maux->gvcf[j].active ) + fprintf(bcftools_stderr,"<*>"); for (l=0; ln_allele; l++) fprintf(bcftools_stderr,"%s%s", l==0?"":",", line->d.allele[l]); if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"]"); @@ -2688,9 +2842,10 @@ void debug_state(args_t *args) { maux_t 
*maux = args->maux; int i,j; + fprintf(bcftools_stderr,"State after position=%d done:\n",maux->pos+1); for (i=0; ifiles->nreaders; i++) { - fprintf(bcftools_stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); + fprintf(bcftools_stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); if ( maux->buf[i].cur >=0 ) { bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); @@ -2700,20 +2855,136 @@ void debug_state(args_t *args) } fprintf(bcftools_stderr,"\n"); } - fprintf(bcftools_stderr,"gvcf_min=%d\n", args->maux->gvcf_min); + fprintf(bcftools_stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min); for (i=0; ifiles->nreaders; i++) { - fprintf(bcftools_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); + fprintf(bcftools_stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); fprintf(bcftools_stderr,"\n"); } fprintf(bcftools_stderr,"\n"); } + +// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h +// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) +static const int + snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), + indel_mask = (VCF_INDEL<<1), + ins_mask = VCF_INS<<1, + del_mask = VCF_DEL<<1, + ref_mask = 1; + +// Can these types be merged given the -m settings? 
Despite the function's name, its focus is on +// excluding incompatible records, there will be a finer matching later in stage_line() +static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec) +{ + int k; + maux_t *maux = args->maux; + bcf1_t *rec = buf->lines[irec]; + int rec_types = buf->rec[irec].var_types; + + assert( selected_types ); // this is trivially true, set in can_merge() + + if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything + + // REF and gVCF_REF with no other alleles present can be merged with anything + if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1; + if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1; + + if ( args->collapse!=COLLAPSE_NONE ) + { + // If we are here, one the following modes must have been set: both,snps,indels,snp-ins-del + // Include the new record if + // - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del + // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del + if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) ) + { + if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1; + } + if ( args->collapse&COLLAPSE_INDELS ) + { + if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1; + } + if ( args->collapse&COLLAPSE_SNP_INS_DEL ) + { + if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1; + if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1; + } + // Whatever is left, allow to match if the alleles match exactly + } + + // The -m none mode or exact matching requested + // Simple test first: are the variants of the same type? 
+ int x = selected_types >> 1; // remove REF + int y = rec_types >> 1; // remove REF + while ( x && y ) { x>>=1; y>>=1; } + if ( x || y ) return 0; // the types differ + + if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible + for (k=1; kn_allele; k++) + { + if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or ) + if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break; + } + if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k] + return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF +} + +static void maux_update_alleles(args_t *args, int ireader, int irec) +{ + int k; + bcf_sr_t *reader = &args->files->readers[ireader]; + maux_t *maux = args->maux; + buffer_t *buf = &maux->buf[ireader]; + maux1_t *ma1 = &buf->rec[irec]; + bcf1_t *line = buf->lines[irec]; + hts_expand(int, line->n_allele, ma1->mmap, ma1->map); + if ( !maux->nals ) // first record to be merged, copy the alleles to the output + { + maux->nals = line->n_allele; + hts_expand0(char*, maux->nals, maux->mals, maux->als); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + hts_expand0(int, maux->nals, maux->mals_types, maux->als_types); + for (k=0; knals; k++) + { + free(maux->als[k]); + maux->als[k] = strdup(line->d.allele[k]); + ma1->map[k] = k; + maux->cnt[k] = 1; + int var_type = bcf_has_variant_type(line, k, VCF_ANY); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL; + maux->als_types[k] = var_type ? 
var_type<<1 : ref_mask; + } + return; + } + // normalize alleles + maux->als = merge_alleles(line->d.allele, line->n_allele, ma1->map, maux->als, &maux->nals, &maux->mals); + if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); + hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); + hts_expand0(int, maux->nals, maux->mals_types, maux->als_types); + for (k=1; kn_allele; k++) + { + int ik = ma1->map[k]; + int var_type = bcf_has_variant_type(line, k, VCF_ANY); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL; + maux->als_types[ik] = var_type ? var_type<<1 : ref_mask; + maux->cnt[ik]++; // how many times an allele appears in the files + } + maux->cnt[0]++; +} + /* - Determine which line should be merged from which reader: go through all - readers and all buffered lines, expand REF,ALT and try to match lines with - the same ALTs. + Determine which lines remain to be merged across readers at the current position and + are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible, + SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time. + + At the same time count how many times is each allele present across the readers and records + so that we can prioritize the records with the same alleles to come first. In the end maximum + one record at a time can be selected from each reader and that witll be done in stage_line(). + + The function maux_reset already initialized structures for this position, so here each + reader comes with the beg,end indexes that point to records with the same maux_t.pos position. 
*/ int can_merge(args_t *args) { @@ -2721,28 +2992,39 @@ int can_merge(args_t *args) maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; - int i,j,k, ntodo = 0; + int i,j, ntodo = 0; for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; + maux->cnt[i] = 0; } maux->var_types = maux->nals = 0; - // this is only for the `-m none -g` mode, ensure that <*> lines come last - #define VCF_GVCF_REF 1 - + // In this loop we do the following: + // - remember the first encountered ID if matching by ID + // - count the number of unprocessed records at this position + // - collect all variant types at this position. This is to be able to perform -m matching and + // print SNVs first, then indels, then gVCF blocks + // - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; + buf->var_types = 0; - if ( gaux && gaux[i].active ) + if ( gaux && gaux[i].active ) // active gvcf block { - // skip readers with active gvcf blocks buf->rec[buf->beg].skip = SKIP_DIFF; + maux->var_types |= ref_mask; + buf->var_types |= ref_mask; + buf->rec[buf->beg].var_types = ref_mask; continue; } + + // for gvcf: find out REF at this position + if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0]; + for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip & SKIP_DONE ) continue; @@ -2751,118 +3033,70 @@ int can_merge(args_t *args) ntodo++; bcf1_t *line = buf->lines[j]; - if ( args->merge_by_id ) - id = line->d.id; - else + if ( args->merge_by_id && !id ) { id = line->d.id; continue; } // set ID when merging by id + + if ( !buf->rec[j].var_types ) { int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); - if (var_type < 0) error("bcf_has_variant_types() failed."); + if ( var_type < 0 ) error("bcf_has_variant_types() failed."); if ( args->collapse==COLLAPSE_SNP_INS_DEL ) { // need to distinguish between ins and del so strip the VCF_INDEL 
flag var_type &= ~VCF_INDEL; } - maux->var_types |= var_type ? var_type<<2 : 2; - - // for the `-m none -g` mode - if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) ) - maux->var_types |= VCF_GVCF_REF; + var_type = var_type ? var_type<<1 : ref_mask; + if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask; + buf->rec[j].var_types = var_type; } + maux->var_types |= buf->rec[j].var_types; + buf->var_types |= buf->rec[j].var_types; } - - // for gvcf: find out REF at this position - if ( buf->beg < buf->end && ref=='N' ) - ref = buf->lines[buf->beg]->d.allele[0][0]; } if ( !ntodo ) return 0; + int selected_types = 0; + // In this loop we select from each reader compatible candidate lines. // (i.e. SNPs or indels). Go through all files and all lines at this // position and normalize relevant alleles. // REF-only sites may be associated with both SNPs and indels. for (i=0; inreaders; i++) { - bcf_sr_t *reader = &files->readers[i]; buffer_t *buf = &maux->buf[i]; - if ( gaux && gaux[i].active ) { + // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position gaux[i].line->d.allele[0][0] = ref; gaux[i].line->pos = maux->pos; + maux_update_alleles(args, i, buf->beg); + selected_types |= ref_mask; + continue; } - for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip & SKIP_DONE ) continue; bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer - - int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); - if (line_type < 0) error("bcf_has_variant_types() failed."); - line_type = line_type ? 
line_type<<2 : 2; + int line_types = buf->rec[j].var_types; // select relevant lines if ( args->merge_by_id ) { - if ( strcmp(id,line->d.id) ) continue; + if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record } + else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue; else { - // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant - // records come last, otherwise infinite loop is created (#1164) - if ( args->collapse==COLLAPSE_NONE && args->do_gvcf ) - { - if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue; - } - if ( args->collapse==COLLAPSE_NONE && maux->nals ) - { - // All alleles of the tested record must be present in the - // selected maux record plus variant types must be the same - if ( (maux->var_types & line_type) != line_type ) continue; - if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible - for (k=1; kn_allele; k++) - { - if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break; - } - if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele - } - if ( !(args->collapse&COLLAPSE_ANY) ) - { - // Merge: - // - SNPs+SNPs+MNPs+REF if -m both,snps - // - indels+indels+REF if -m both,indels, REF only if SNPs are not present - // - SNPs come first - if ( line_type & (indel_mask|ins_mask|del_mask) ) - { - if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first - if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks - } - } + // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes + if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics + && (maux->var_types&snp_mask) // there are SNVs at the current position + && 
!(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref + ) continue; } - buf->rec[j].skip = 0; + selected_types |= line_types; - hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map); - if ( !maux->nals ) // first record, copy the alleles to the output - { - maux->nals = line->n_allele; - hts_expand0(char*, maux->nals, maux->mals, maux->als); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=0; knals; k++) - { - free(maux->als[k]); - maux->als[k] = strdup(line->d.allele[k]); - buf->rec[j].map[k] = k; - maux->cnt[k] = 1; - } - continue; - } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); - if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=1; kn_allele; k++) - maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files - maux->cnt[0]++; + buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line + maux_update_alleles(args, i, j); } } return 1; @@ -2880,48 +3114,61 @@ void stage_line(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; - // debug_maux(args); - - // take the most frequent allele present in multiple files, REF is skipped - int i,j,k,icnt = 1; - for (i=2; inals; i++) - if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i; + // Take the most frequent allele present in multiple files, REF and gVCF_REF is skipped. + int i,j,k,icnt = -1; + for (i=1; inals; i++) + { + if ( maux->als_types[i] & ref_mask ) continue; + if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i; + } + int selected_type = icnt>0 ? 
maux->als_types[icnt] : ref_mask; int nout = 0; for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; buf->cur = -1; - if ( buf->beg >= buf->end ) continue; // no lines in the buffer + if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site // find lines with the same allele for (j=buf->beg; jend; j++) { - if ( buf->rec[j].skip ) continue; // done or not compatible - if ( args->merge_by_id ) break; - if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record + if ( buf->rec[j].skip ) + { + int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0; + if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1; + if ( !is_gvcf ) continue; // done or not compatible + } + if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line + + // skip if the reader has a record that matches the most frequent allele and this record is not it + if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue; + // if the reader does not have the most frequent allele type but is a ref, accept + if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break; + if ( selected_type==ref_mask ) break; + + // accept if the record has the most frequent allele for (k=0; klines[j]->n_allele; k++) if ( icnt==buf->rec[j].map[k] ) break; - if ( klines[j]->n_allele ) break; } if ( j>=buf->end ) { // no matching allele found in this file - if ( args->collapse==COLLAPSE_NONE ) continue; + if ( args->collapse==COLLAPSE_NONE ) continue; // exact matching requested, skip + // choose something compatible to create a multiallelic site given the -m criteria for (j=buf->beg; jend; j++) { if ( buf->rec[j].skip ) continue; // done or not compatible if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged - int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap); - if (line_type < 0) error("bcf_has_variant_types() failed."); - if ( 
maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; - if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; - if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; - if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; - if ( line_type==VCF_REF ) + int line_type = buf->rec[j].var_types; + if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; + if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( line_type&ref_mask ) { if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; @@ -2942,12 +3189,21 @@ void stage_line(args_t *args) { // found a suitable line for merging buf->cur = j; - - // mark as finished so that it's ignored next time - buf->rec[j].skip = SKIP_DONE; - nout++; } } + + // debug_maux(args); + + // Mark lines staged for merging as finished so that they are ignored next time + for (i=0; inreaders; i++) + { + buffer_t *buf = &maux->buf[i]; + if ( buf->cur == -1 ) continue; + + buf->rec[buf->cur].skip = SKIP_DONE; + nout++; + } + assert( nout ); } @@ -3080,6 +3336,7 @@ void merge_vcf(args_t *args) error_errno("[%s] Failed to update header", __func__); } info_rules_init(args); + missing_rules_init(args); bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -3089,6 +3346,7 @@ void merge_vcf(args_t *args) if ( 
hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); return; } + else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init(); args->maux = maux_init(args); @@ -3124,9 +3382,19 @@ void merge_vcf(args_t *args) gvcf_flush(args,1); info_rules_destroy(args); + missing_rules_destroy(args); maux_destroy(args->maux); bcf_hdr_destroy(args->out_hdr); - if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"bcftools_stdout"); bcf_destroy1(args->out_line); kh_destroy(strdict, args->tmph); if ( args->tmps.m ) free(args->tmps.s); @@ -3148,11 +3416,12 @@ static void usage(void) fprintf(bcftools_stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n"); fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); fprintf(bcftools_stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); - fprintf(bcftools_stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); + fprintf(bcftools_stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. 
Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n"); fprintf(bcftools_stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(bcftools_stderr, " -l, --file-list FILE Read file names from the file\n"); fprintf(bcftools_stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for , see man page for details [both]\n"); + fprintf(bcftools_stderr, " -M, --missing-rules TAG:METHOD Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n"); fprintf(bcftools_stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -3161,6 +3430,7 @@ static void usage(void) fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -3199,13 +3469,15 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,4}, {"info-rules",required_argument,NULL,'i'}, + {"missing-rules",required_argument,NULL,'M'}, {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, + 
{"write-index",no_argument,NULL,11}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) { switch (c) { case 'L': args->local_alleles = strtol(optarg,&tmp,10); @@ -3229,6 +3501,7 @@ int main_vcfmerge(int argc, char *argv[]) break; case 'l': args->file_list = optarg; break; case 'i': args->info_rules = optarg; break; + case 'M': args->missing_rules_str = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -3256,7 +3529,7 @@ int main_vcfmerge(int argc, char *argv[]) else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE; - else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL; + else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS; else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; } else error("The -m type \"%s\" is not recognised.\n", optarg); break; @@ -3273,6 +3546,7 @@ int main_vcfmerge(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->no_index = 1; break; + case 11 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 9538f8d..02ad322 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -40,6 +40,8 @@ THE SOFTWARE. 
*/ #include "bcftools.h" #include "rbuf.h" #include "abuf.h" +#include "gff.h" +#include "regidx.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -86,8 +88,8 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, tmp_kstr; - int ntmp_als; + kstring_t *tmp_als, *tmp_del, tmp_kstr; + int ntmp_als, ntmp_del; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) @@ -105,6 +107,13 @@ typedef struct int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; + char *index_fn; + int write_index; + int right_align; + char *gff_fname; + gff_t *gff; + regidx_t *idx_tscript; + regitr_t *itr_tscript; } args_t; @@ -344,6 +353,157 @@ static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) error("An error occurred while updating INFO/%s\n",args->old_rec_tag); } +static int is_left_align(args_t *args, bcf1_t *line) +{ + if ( args->right_align ) return 0; + if ( !args->gff ) return 1; + const char *chr = bcf_seqname(args->hdr,line); + if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do + if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1; + + // if there are two conflicting overlapping transcripts, go with the default left-alignment + int has_fwd = 0; + while ( regitr_overlap(args->itr_tscript) ) + { + gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*); + if ( tr->strand==STRAND_FWD ) has_fwd = 1; + if ( tr->strand==STRAND_REV ) return 1; + } + // either no hit at all (then left-align) or everything was on fwd strand (then right-align) + return has_fwd ? 
0 : 1; +} +static hts_pos_t realign_left(args_t *args, bcf1_t *line) +{ + // trim from right + char *ref = NULL; + int i; + hts_pos_t nref=0, new_pos = line->pos; + kstring_t *als = args->tmp_als; + while (1) + { + // is the rightmost base identical in all alleles? + int min_len = als[0].l; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break; + if ( als[i].l < min_len ) min_len = als[i].l; + } + if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed + if ( min_len<=1 && new_pos==0 ) break; + + int pad_from_left = 0; + for (i=0; in_allele; i++) // trim all alleles + { + als[i].l--; + if ( !als[i].l ) pad_from_left = 1; + } + if ( pad_from_left ) + { + // extend all alleles to the left by aln_win bases (unless close to the chr start). + // Extra bases will be trimmed from the left after this loop is done + int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; + free(ref); + ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref); + if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1); + replace_iupac_codes(ref,nref); + for (i=0; in_allele; i++) + { + ks_resize(&als[i], als[i].l + npad); + if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l); + memcpy(als[i].s,ref,npad); + als[i].l += npad; + } + new_pos -= npad; + } + } + free(ref); + + // trim from left + int ntrim_left = 0; + while (1) + { + // is the first base identical in all alleles? 
+ int min_len = als[0].l - ntrim_left; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break; + if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; + } + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed + ntrim_left++; + } + if ( ntrim_left ) + { + for (i=0; in_allele; i++) + { + memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); + als[i].l -= ntrim_left; + } + new_pos += ntrim_left; + } + return new_pos; +} + +static hts_pos_t realign_right(args_t *args, bcf1_t *line) +{ + char *ref = NULL; + int i; + hts_pos_t new_pos = line->pos, nref = 0; + kstring_t *als = args->tmp_als; + + // trim from left + int ntrim_left = 0, npad_right = line->rlen, has_indel = 0; + while (1) + { + // is the leftmost base identical in all alleles? + int min_len = als[0].l - ntrim_left; + for (i=1; in_allele; i++) + { + if ( als[0].l!=als[i].l ) has_indel = 1; + if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break; + if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; + } + if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further + + ntrim_left++; + if ( min_len<=1 ) // pad from the right + { + free(ref); + ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref); + if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left); + npad_right += args->aln_win; + replace_iupac_codes(ref,nref); + for (i=0; in_allele; i++) kputs(ref, &als[i]); + } + } + ntrim_left -= has_indel; + if ( ntrim_left > 0 ) + { + for (i=0; in_allele; i++) + { + memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left); + als[i].l -= ntrim_left; + } + new_pos += ntrim_left; + } + free(ref); + + // trim from right + while (1) + { + // is the last base identical in all alleles? 
+ int min_len = als[0].l; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break; + if ( min_len > als[i].l ) min_len = als[i].l; + } + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more + for (i=0; in_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; } + } + return new_pos; +} + #define ERR_DUP_ALLELE -2 #define ERR_REF_MISMATCH -1 #define ERR_OK 0 @@ -396,10 +556,32 @@ static int realign(args_t *args, bcf1_t *line) // make a copy of each allele for trimming hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); + hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del); kstring_t *als = args->tmp_als; + kstring_t *del = args->tmp_del; for (i=0; in_allele; i++) { - if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele + del[i].l = 0; + if ( line->d.allele[i][0]=='<' ) + { + // symbolic allele, only will be realigned + if ( strncmp("d.allele[i],4) ) return ERR_SYMBOLIC; + if ( nref < line->rlen ) + { + free(ref); + reflen = line->rlen; + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); + seq_to_upper(ref,0); + replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N + als[0].l = 0; + kputs(ref, &als[0]); + als[i].l = 0; + kputsn(ref,1,&als[i]); + kputs(line->d.allele[i],&del[i]); + continue; + } + } if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) { @@ -416,69 +598,17 @@ static int realign(args_t *args, bcf1_t *line) if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; } - - // trim from right - int new_pos = line->pos; - while (1) - { - // is the rightmost base 
identical in all alleles? - int min_len = als[0].l; - for (i=1; in_allele; i++) - { - if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; - if ( als[i].l < min_len ) min_len = als[i].l; - } - if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed - if ( min_len<=1 && new_pos==0 ) break; - - int pad_from_left = 0; - for (i=0; in_allele; i++) // trim all alleles - { - als[i].l--; - if ( !als[i].l ) pad_from_left = 1; - } - if ( pad_from_left ) - { - int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; - free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref); - if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1); - replace_iupac_codes(ref,nref); - for (i=0; in_allele; i++) - { - ks_resize(&als[i], als[i].l + npad); - if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l); - memcpy(als[i].s,ref,npad); - als[i].l += npad; - } - new_pos -= npad; - } - } free(ref); + ref = NULL; - // trim from left - int ntrim_left = 0; - while (1) - { - // is the first base identical in all alleles? - int min_len = als[0].l - ntrim_left; - for (i=1; in_allele; i++) - { - if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break; - if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; - } - if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed - ntrim_left++; - } - if ( ntrim_left ) - { - for (i=0; in_allele; i++) - { - memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); - als[i].l -= ntrim_left; - } - new_pos += ntrim_left; - } + // which direction are we aligning? + int left_align = is_left_align(args, line); + + hts_pos_t new_pos; + if ( left_align ) + new_pos = realign_left(args, line); + else + new_pos = realign_right(args, line); // Have the alleles changed? 
als[0].s[ als[0].l ] = 0; // in order for strcmp to work @@ -491,7 +621,8 @@ static int realign(args_t *args, bcf1_t *line) for (i=0; in_allele; i++) { if (i>0) kputc(',',&args->tmp_kstr); - kputsn(als[i].s,als[i].l,&args->tmp_kstr); + if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr); + else kputsn(als[i].s,als[i].l,&args->tmp_kstr); } args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s); @@ -1281,10 +1412,12 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ ngts2 /= nsmpl; if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); - int32_t *gt = (int32_t*) args->tmp_arr1; - int32_t *gt2 = (int32_t*) args->tmp_arr2; + int32_t *gt = (int32_t*) args->tmp_arr1; // the first, destination line + int32_t *gt2 = (int32_t*) args->tmp_arr2; // one of the subsequent lines, i.e. the source line for (j=0; j=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2); + + // The destination allele int ial = args->maps[i].map[ial2]; - for (k=0; kabuf, const char*, INFO_TAG, args->old_rec_tag); abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele); } + if ( args->gff_fname ) + { + args->gff = gff_init(args->gff_fname); + gff_set(args->gff,verbosity,1); + gff_set(args->gff,strip_chr_names,1); + gff_parse(args->gff); + args->idx_tscript = gff_get(args->gff,idx_tscript); + args->itr_tscript = regitr_init(NULL); + } } static void destroy_data(args_t *args) { + if ( args->gff ) + { + gff_destroy(args->gff); + regitr_destroy(args->itr_tscript); + } cmpals_destroy(&args->cmpals_in); cmpals_destroy(&args->cmpals_out); int i; @@ -1929,7 +2082,10 @@ static void destroy_data(args_t *args) free(args->maps[i].map); for (i=0; intmp_als; i++) free(args->tmp_als[i].s); + for (i=0; intmp_del; i++) + 
free(args->tmp_del[i].s); free(args->tmp_als); + free(args->tmp_del); free(args->tmp_kstr.s); if ( args->tmp_str ) { @@ -2018,6 +2174,7 @@ static void normalize_vcf(args_t *args) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; @@ -2081,6 +2238,15 @@ static void normalize_vcf(args_t *args) if ( j>0 ) flush_buffer(args, args->out, j); } flush_buffer(args, args->out, args->rbuf.n); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out)<0 ) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); @@ -2104,6 +2270,7 @@ static void usage(void) fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n"); fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); + fprintf(stderr, " -g, --gff-annot FILE Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n"); fprintf(stderr, " --keep-sum TAG,.. 
Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n"); @@ -2121,6 +2288,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # normalize and left-align indels\n"); @@ -2163,6 +2331,8 @@ int main_vcfnorm(int argc, char *argv[]) {"old-rec-tag",required_argument,NULL,12}, {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, + {"gff-annot",required_argument,NULL,'g'}, + {"right-align",no_argument,NULL,15}, // undocumented, only for debugging {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, {"multi-overlaps",required_argument,NULL,13}, @@ -2181,10 +2351,11 @@ int main_vcfnorm(int argc, char *argv[]) {"check-ref",required_argument,NULL,'c'}, {"strict-filter",no_argument,NULL,'s'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,14}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) { switch (c) { case 10: // possibly generalize this also to INFO/AD and other tags @@ -2192,6 +2363,7 @@ int main_vcfnorm(int argc, char *argv[]) error("Error: only --keep-sum AD is currently supported. 
See https://github.com/samtools/bcftools/issues/360 for more.\n"); args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data break; + case 'g': args->gff_fname = optarg; break; case 'a': args->atomize = SPLIT; break; case 11 : if ( optarg[0]=='*' ) args->use_star_allele = 1; @@ -2204,6 +2376,8 @@ int main_vcfnorm(int argc, char *argv[]) else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; else error("Invalid argument to --multi-overlaps\n"); break; + case 14 : args->write_index = 1; break; + case 15 : args->right_align = 1; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index e2d4177..de9c285 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -2,7 +2,7 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -42,6 +42,8 @@ THE SOFTWARE. 
*/ #include "bcftools.h" #include "rbuf.h" #include "abuf.h" +#include "gff.h" +#include "regidx.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -88,8 +90,8 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, tmp_kstr; - int ntmp_als; + kstring_t *tmp_als, *tmp_del, tmp_kstr; + int ntmp_als, ntmp_del; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) @@ -107,6 +109,13 @@ typedef struct int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; + char *index_fn; + int write_index; + int right_align; + char *gff_fname; + gff_t *gff; + regidx_t *idx_tscript; + regitr_t *itr_tscript; } args_t; @@ -346,6 +355,157 @@ static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) error("An error occurred while updating INFO/%s\n",args->old_rec_tag); } +static int is_left_align(args_t *args, bcf1_t *line) +{ + if ( args->right_align ) return 0; + if ( !args->gff ) return 1; + const char *chr = bcf_seqname(args->hdr,line); + if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do + if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1; + + // if there are two conflicting overlapping transcripts, go with the default left-alignment + int has_fwd = 0; + while ( regitr_overlap(args->itr_tscript) ) + { + gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*); + if ( tr->strand==STRAND_FWD ) has_fwd = 1; + if ( tr->strand==STRAND_REV ) return 1; + } + // either no hit at all (then left-align) or everything was on fwd strand (then right-align) + return has_fwd ? 
0 : 1; +} +static hts_pos_t realign_left(args_t *args, bcf1_t *line) +{ + // trim from right + char *ref = NULL; + int i; + hts_pos_t nref=0, new_pos = line->pos; + kstring_t *als = args->tmp_als; + while (1) + { + // is the rightmost base identical in all alleles? + int min_len = als[0].l; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break; + if ( als[i].l < min_len ) min_len = als[i].l; + } + if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed + if ( min_len<=1 && new_pos==0 ) break; + + int pad_from_left = 0; + for (i=0; in_allele; i++) // trim all alleles + { + als[i].l--; + if ( !als[i].l ) pad_from_left = 1; + } + if ( pad_from_left ) + { + // extend all alleles to the left by aln_win bases (unless close to the chr start). + // Extra bases will be trimmed from the left after this loop is done + int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; + free(ref); + ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref); + if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1); + replace_iupac_codes(ref,nref); + for (i=0; in_allele; i++) + { + ks_resize(&als[i], als[i].l + npad); + if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l); + memcpy(als[i].s,ref,npad); + als[i].l += npad; + } + new_pos -= npad; + } + } + free(ref); + + // trim from left + int ntrim_left = 0; + while (1) + { + // is the first base identical in all alleles? 
+ int min_len = als[0].l - ntrim_left; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break; + if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; + } + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed + ntrim_left++; + } + if ( ntrim_left ) + { + for (i=0; in_allele; i++) + { + memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); + als[i].l -= ntrim_left; + } + new_pos += ntrim_left; + } + return new_pos; +} + +static hts_pos_t realign_right(args_t *args, bcf1_t *line) +{ + char *ref = NULL; + int i; + hts_pos_t new_pos = line->pos, nref = 0; + kstring_t *als = args->tmp_als; + + // trim from left + int ntrim_left = 0, npad_right = line->rlen, has_indel = 0; + while (1) + { + // is the leftmost base identical in all alleles? + int min_len = als[0].l - ntrim_left; + for (i=1; in_allele; i++) + { + if ( als[0].l!=als[i].l ) has_indel = 1; + if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break; + if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; + } + if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further + + ntrim_left++; + if ( min_len<=1 ) // pad from the right + { + free(ref); + ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref); + if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left); + npad_right += args->aln_win; + replace_iupac_codes(ref,nref); + for (i=0; in_allele; i++) kputs(ref, &als[i]); + } + } + ntrim_left -= has_indel; + if ( ntrim_left > 0 ) + { + for (i=0; in_allele; i++) + { + memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left); + als[i].l -= ntrim_left; + } + new_pos += ntrim_left; + } + free(ref); + + // trim from right + while (1) + { + // is the last base identical in all alleles? 
+ int min_len = als[0].l; + for (i=1; in_allele; i++) + { + if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break; + if ( min_len > als[i].l ) min_len = als[i].l; + } + if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more + for (i=0; in_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; } + } + return new_pos; +} + #define ERR_DUP_ALLELE -2 #define ERR_REF_MISMATCH -1 #define ERR_OK 0 @@ -398,10 +558,32 @@ static int realign(args_t *args, bcf1_t *line) // make a copy of each allele for trimming hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als); + hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del); kstring_t *als = args->tmp_als; + kstring_t *del = args->tmp_del; for (i=0; in_allele; i++) { - if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele + del[i].l = 0; + if ( line->d.allele[i][0]=='<' ) + { + // symbolic allele, only will be realigned + if ( strncmp("d.allele[i],4) ) return ERR_SYMBOLIC; + if ( nref < line->rlen ) + { + free(ref); + reflen = line->rlen; + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); + seq_to_upper(ref,0); + replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N + als[0].l = 0; + kputs(ref, &als[0]); + als[i].l = 0; + kputsn(ref,1,&als[i]); + kputs(line->d.allele[i],&del[i]); + continue; + } + } if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) { @@ -418,69 +600,17 @@ static int realign(args_t *args, bcf1_t *line) if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; } - - // trim from right - int new_pos = line->pos; - while (1) - { - // is the rightmost base 
identical in all alleles? - int min_len = als[0].l; - for (i=1; in_allele; i++) - { - if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; - if ( als[i].l < min_len ) min_len = als[i].l; - } - if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed - if ( min_len<=1 && new_pos==0 ) break; - - int pad_from_left = 0; - for (i=0; in_allele; i++) // trim all alleles - { - als[i].l--; - if ( !als[i].l ) pad_from_left = 1; - } - if ( pad_from_left ) - { - int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; - free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref); - if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1); - replace_iupac_codes(ref,nref); - for (i=0; in_allele; i++) - { - ks_resize(&als[i], als[i].l + npad); - if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l); - memcpy(als[i].s,ref,npad); - als[i].l += npad; - } - new_pos -= npad; - } - } free(ref); + ref = NULL; - // trim from left - int ntrim_left = 0; - while (1) - { - // is the first base identical in all alleles? - int min_len = als[0].l - ntrim_left; - for (i=1; in_allele; i++) - { - if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break; - if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left; - } - if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed - ntrim_left++; - } - if ( ntrim_left ) - { - for (i=0; in_allele; i++) - { - memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); - als[i].l -= ntrim_left; - } - new_pos += ntrim_left; - } + // which direction are we aligning? + int left_align = is_left_align(args, line); + + hts_pos_t new_pos; + if ( left_align ) + new_pos = realign_left(args, line); + else + new_pos = realign_right(args, line); // Have the alleles changed? 
als[0].s[ als[0].l ] = 0; // in order for strcmp to work @@ -493,7 +623,8 @@ static int realign(args_t *args, bcf1_t *line) for (i=0; in_allele; i++) { if (i>0) kputc(',',&args->tmp_kstr); - kputsn(als[i].s,als[i].l,&args->tmp_kstr); + if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr); + else kputsn(als[i].s,als[i].l,&args->tmp_kstr); } args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s); @@ -1283,10 +1414,12 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ ngts2 /= nsmpl; if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); - int32_t *gt = (int32_t*) args->tmp_arr1; - int32_t *gt2 = (int32_t*) args->tmp_arr2; + int32_t *gt = (int32_t*) args->tmp_arr1; // the first, destination line + int32_t *gt2 = (int32_t*) args->tmp_arr2; // one of the subsequent lines, i.e. the source line for (j=0; j=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2); + + // The destination allele int ial = args->maps[i].map[ial2]; - for (k=0; kabuf, const char*, INFO_TAG, args->old_rec_tag); abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele); } + if ( args->gff_fname ) + { + args->gff = gff_init(args->gff_fname); + gff_set(args->gff,verbosity,1); + gff_set(args->gff,strip_chr_names,1); + gff_parse(args->gff); + args->idx_tscript = gff_get(args->gff,idx_tscript); + args->itr_tscript = regitr_init(NULL); + } } static void destroy_data(args_t *args) { + if ( args->gff ) + { + gff_destroy(args->gff); + regitr_destroy(args->itr_tscript); + } cmpals_destroy(&args->cmpals_in); cmpals_destroy(&args->cmpals_out); int i; @@ -1931,7 +2084,10 @@ static void destroy_data(args_t *args) free(args->maps[i].map); for (i=0; intmp_als; i++) free(args->tmp_als[i].s); + for (i=0; intmp_del; i++) + 
free(args->tmp_del[i].s); free(args->tmp_als); + free(args->tmp_del); free(args->tmp_kstr.s); if ( args->tmp_str ) { @@ -2020,6 +2176,7 @@ static void normalize_vcf(args_t *args) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; @@ -2083,6 +2240,15 @@ static void normalize_vcf(args_t *args) if ( j>0 ) flush_buffer(args, args->out, j); } flush_buffer(args, args->out, args->rbuf.n); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out)<0 ) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); @@ -2106,6 +2272,7 @@ static void usage(void) fprintf(bcftools_stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence\n"); fprintf(bcftools_stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); + fprintf(bcftools_stderr, " -g, --gff-annot FILE Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n"); fprintf(bcftools_stderr, " --keep-sum TAG,.. 
Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(bcftools_stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(bcftools_stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n"); @@ -2123,6 +2290,7 @@ static void usage(void) fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # normalize and left-align indels\n"); @@ -2165,6 +2333,8 @@ int main_vcfnorm(int argc, char *argv[]) {"old-rec-tag",required_argument,NULL,12}, {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, + {"gff-annot",required_argument,NULL,'g'}, + {"right-align",no_argument,NULL,15}, // undocumented, only for debugging {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, {"multi-overlaps",required_argument,NULL,13}, @@ -2183,10 +2353,11 @@ int main_vcfnorm(int argc, char *argv[]) {"check-ref",required_argument,NULL,'c'}, {"strict-filter",no_argument,NULL,'s'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,14}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) { switch (c) { case 10: // possibly generalize this also to INFO/AD and other tags @@ -2194,6 +2365,7 @@ int main_vcfnorm(int argc, char 
*argv[]) error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n"); args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data break; + case 'g': args->gff_fname = optarg; break; case 'a': args->atomize = SPLIT; break; case 11 : if ( optarg[0]=='*' ) args->use_star_allele = 1; @@ -2206,6 +2378,8 @@ int main_vcfnorm(int argc, char *argv[]) else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; else error("Invalid argument to --multi-overlaps\n"); break; + case 14 : args->write_index = 1; break; + case 15 : args->right_align = 1; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index 4568668..6877519 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -149,6 +149,8 @@ typedef struct _args_t char **argv, *output_fname, *regions_list, *targets_list; int argc, drop_header, verbose, record_cmd_line, plist_only; + char *index_fn; + int write_index; } args_t; @@ -548,6 +550,7 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -569,7 +572,19 @@ static void destroy_data(args_t *args) } if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if (args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + } } static void usage(args_t *args) @@ -598,6 +613,7 @@ static void usage(args_t *args) fprintf(stderr, " -l, --list-plugins List available plugins. 
See BCFTOOLS_PLUGINS environment variable and man page for details\n"); fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); fprintf(stderr, " -V, --version Print version string and exit\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -643,9 +659,9 @@ int main_plugin(int argc, char *argv[]) if ( argv[1][0]!='-' ) { args->verbose = is_verbose(argc, argv); - plugin_name = argv[1]; - argc--; - argv++; + plugin_name = argv[1]; + argc--; + argv++; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->plugin.run ) { @@ -675,6 +691,7 @@ int main_plugin(int argc, char *argv[]) {"targets-file",required_argument,NULL,'T'}, {"targets-overlap",required_argument,NULL,2}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -723,6 +740,7 @@ int main_plugin(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index b37ac23..ad04eb4 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -2,7 +2,7 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -151,6 +151,8 @@ typedef struct _args_t char **argv, *output_fname, *regions_list, *targets_list; int argc, drop_header, verbose, record_cmd_line, plist_only; + char *index_fn; + int write_index; } args_t; @@ -550,6 +552,7 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -571,7 +574,19 @@ static void destroy_data(args_t *args) } if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if (args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + } } static void usage(args_t *args) @@ -600,6 +615,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -l, --list-plugins List available plugins. 
See BCFTOOLS_PLUGINS environment variable and man page for details\n"); fprintf(bcftools_stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); fprintf(bcftools_stderr, " -V, --version Print version string and exit\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -645,9 +661,9 @@ int main_plugin(int argc, char *argv[]) if ( argv[1][0]!='-' ) { args->verbose = is_verbose(argc, argv); - plugin_name = argv[1]; - argc--; - argv++; + plugin_name = argv[1]; + argc--; + argv++; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->plugin.run ) { @@ -677,6 +693,7 @@ int main_plugin(int argc, char *argv[]) {"targets-file",required_argument,NULL,'T'}, {"targets-overlap",required_argument,NULL,2}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -725,6 +742,7 @@ int main_plugin(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 889f363..5f4eb07 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -94,6 +94,7 @@ static void init_data(args_t *args) smpl_ilist_destroy(ilist); } args->convert = convert_init(args->header, samples, nsamples, args->format_str); + convert_set_option(args->convert, force_newline, 1); convert_set_option(args->convert, subset_samples, &args->smpl_pass); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); free(samples); diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index f1e0f8b..e4f2520 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -2,7 +2,7 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -96,6 +96,7 @@ static void init_data(args_t *args) smpl_ilist_destroy(ilist); } args->convert = convert_init(args->header, samples, nsamples, args->format_str); + convert_set_option(args->convert, force_newline, 1); convert_set_option(args->convert, subset_samples, &args->smpl_pass); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); free(samples); diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c index 1de2b28..3b208a0 100644 --- a/bcftools/vcfsort.c +++ b/bcftools/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2022 Genome Research Ltd. + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -62,6 +62,8 @@ typedef struct _args_t uint8_t *mem_block; size_t nbuf, mbuf, nblk; blk_t *blk; + char *index_fn; + int write_index; } args_t; @@ -300,6 +302,7 @@ void merge_blocks(args_t *args) set_wmode(wmode,args->output_type,args->output_fname,args->clevel); htsFile *out = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bhp->ndat ) { blk_t *blk = bhp->dat[0]; @@ -307,6 +310,15 @@ void merge_blocks(args_t *args) khp_delete(blk, bhp); blk_read(args, bhp, args->hdr, blk); } + if ( args->write_index ) + { + if ( bcf_idx_save(out)<0 ) + { + if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); clean_files(args); @@ -333,6 +345,7 @@ static void usage(args_t *args) #else fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); #endif + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -395,6 +408,7 @@ int main_sort(int argc, char *argv[]) {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, + {"write-index",no_argument,NULL,1}, {0,0,0,0} }; char *tmp; @@ -423,6 +437,7 @@ int main_sort(int argc, char *argv[]) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c index 79dbc43..948d60b 100644 --- a/bcftools/vcfsort.c.pysam.c +++ b/bcftools/vcfsort.c.pysam.c @@ -2,7 +2,7 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2022 Genome Research Ltd. 
+ Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -64,6 +64,8 @@ typedef struct _args_t uint8_t *mem_block; size_t nbuf, mbuf, nblk; blk_t *blk; + char *index_fn; + int write_index; } args_t; @@ -302,6 +304,7 @@ void merge_blocks(args_t *args) set_wmode(wmode,args->output_type,args->output_fname,args->clevel); htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bhp->ndat ) { blk_t *blk = bhp->dat[0]; @@ -309,6 +312,15 @@ void merge_blocks(args_t *args) khp_delete(blk, bhp); blk_read(args, bhp, args->hdr, blk); } + if ( args->write_index ) + { + if ( bcf_idx_save(out)<0 ) + { + if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); clean_files(args); @@ -335,6 +347,7 @@ static void usage(args_t *args) #else fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); #endif + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -397,6 +410,7 @@ int main_sort(int argc, char *argv[]) {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, + {"write-index",no_argument,NULL,1}, {0,0,0,0} }; char *tmp; @@ -425,6 +439,7 @@ int main_sort(int argc, char *argv[]) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : 
args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 10189fe..e2744ab 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -70,6 +70,13 @@ typedef struct } idist_t; +// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins +typedef struct +{ + int snv[21], indel[21]; +} +vaf_t; + typedef struct { uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; @@ -93,7 +100,8 @@ typedef struct int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; - int *smpl_frm_shifts; // not-applicable, in-frame, out-frame + int *smpl_frm_shifts; // not-applicable, in-frame, out-frame + vaf_t vaf, *smpl_vaf; // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions unsigned long int *smpl_dp; idist_t dp, dp_sites; int nusr; @@ -141,7 +149,9 @@ typedef struct gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons bin_t *af_bins; float *farr; - int mfarr; + int32_t *iarr; + int mfarr, miarr; + int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref; // indel context indel_ctx_t *indel_ctx; @@ -447,6 +457,8 @@ static void init_stats(args_t *args) if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) ) error("No such INFO tag: %s\n", args->af_tag); + int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 
1 : 0; + #if QUAL_STATS args->m_qual = 999; #endif @@ -501,6 +513,8 @@ static void init_stats(args_t *args) stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int)); stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int)); + if ( has_fmt_ad ) + stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t)); #if HWE_STATS stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int)); #endif @@ -586,6 +600,7 @@ static void destroy_stats(args_t *args) free(stats->smpl_dp); free(stats->smpl_ndp); free(stats->smpl_sngl); + free(stats->smpl_vaf); idist_destroy(&stats->dp); idist_destroy(&stats->dp_sites); for (j=0; jnusr; j++) @@ -602,6 +617,7 @@ static void destroy_stats(args_t *args) for (j=0; jnusr; j++) free(args->usr[j].tag); if ( args->af_bins ) bin_destroy(args->af_bins); free(args->farr); + free(args->iarr); free(args->usr); free(args->tmp_frm); free(args->tmp_iaf); @@ -615,6 +631,8 @@ static void destroy_stats(args_t *args) if (args->filter[1]) filter_destroy(args->filter[1]); } +// The arary tmp_iaf keeps the index of AF bin for each allele, the first bin is for singletons. 
+// The number of bins, either m_af (101) or as given by the user in --af-bins static void init_iaf(args_t *args, bcf_sr_t *reader) { bcf1_t *line = reader->buffer[0]; @@ -869,205 +887,279 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) } } -static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) +// Returns the max non-ref AD value +static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial) { - if ( !fmt ) return; - - float dvaf; + int iv, ad = 0; + *ial = 0; #define BRANCH_INT(type_t,missing,vector_end) { \ - type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ - if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ - if ( p[ial]==missing || p[jal]==missing ) return; \ - if ( !p[ial] && !p[jal] ) return; \ - dvaf = (float)p[ial]/(p[ial]+p[jal]); \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + for (iv=1; ivn; iv++) \ + { \ + if ( ptr[iv]==vector_end ) break; \ + if ( ptr[iv]==missing ) continue; \ + if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\ + } \ } - switch (fmt->type) { + switch (ad_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; } #undef BRANCH_INT - + return ad; +} +static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial) +{ + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + if ( ptr[ial]==vector_end ) return 0; \ + if ( ptr[ial]==missing ) return 0; \ + return ptr[ial]; \ + } + switch (ad_fmt_ptr->type) { + case 
BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; + } + #undef BRANCH_INT +} +static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf) +{ int len = line->d.var[ial].n; if ( len < -stats->m_indel ) len = -stats->m_indel; else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; - stats->dvaf[bin] += dvaf; + stats->dvaf[bin] += vaf; +} +#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05)) +static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf) +{ + int idx = vaf2bin(vaf); + if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++; + else smpl_vaf->indel[idx]++; } -static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) +static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr) { - bcf_srs_t *files = args->files; - bcf1_t *line = reader->buffer[0]; - bcf_fmt_t *fmt_ptr; - int nref_tot = 0, nhet_tot = 0, nalt_tot = 0; - int line_type = bcf_get_variant_types(line); + if ( dp_fmt_ptr ) + { + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \ + if ( *ptr==missing || *ptr==vector_end ) return -1; \ + return *ptr; \ + } + switch (dp_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); exit(1); break; + } + #undef 
BRANCH_INT + } + if ( ad_fmt_ptr ) + { + int iv, dp = 0, has_value = 0; + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + for (iv=0; ivn; iv++) \ + { \ + if ( ptr[iv]==vector_end ) break; \ + if ( ptr[iv]==missing ) continue; \ + has_value = 1; \ + dp += ptr[iv]; \ + } \ + } + switch (ad_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break; + } + #undef BRANCH_INT + if ( !has_value ) return -1; + return dp; + } + return -1; +} +static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal) +{ + if ( gt==GT_UNKN ) + { + stats->smpl_missing[ismpl]++; + return; + } - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) + int var_type = 0; + if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); + if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); + if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) { - bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? 
bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; + if ( var_type&VCF_INDEL && stats->smpl_frm_shifts ) + { + assert( ialn_allele ); + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; + } + if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++; + if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++; + return; + } + if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; } + #if HWE_STATS + switch (gt) + { + case GT_HOM_RR: args->nref_tot++; break; + case GT_HET_RA: args->nhet_tot++; break; + case GT_HET_AA: + case GT_HOM_AA: args->nalt_tot++; break; + } + #endif - int ref = bcf_acgt2int(*line->d.allele[0]); - int is, n_nref = 0, i_nref = 0; - for (is=0; isfiles->n_smpl; is++) + if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP + { + if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++; + else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++; + else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++; + else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++; + if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called { - int ial, jal; - int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal); - if ( gt==GT_UNKN ) - { - stats->smpl_missing[is]++; - continue; - } - if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) + int ref = bcf_acgt2int(*line->d.allele[0]); + int alt = bcf_acgt2int(*line->d.allele[ial]); + if ( alt<0 ) return; + if ( abs(ref-alt)==2 ) + stats->smpl_ts[ismpl]++; + else + stats->smpl_tv[ismpl]++; + } + if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal ) + { + int ref = bcf_acgt2int(*line->d.allele[0]); + int alt = bcf_acgt2int(*line->d.allele[jal]); + if ( alt<0 ) return; + if ( abs(ref-alt)==2 ) + stats->smpl_ts[ismpl]++; + else + stats->smpl_tv[ismpl]++; + } + } + if ( var_type&VCF_INDEL ) + { + if ( gt != GT_HOM_RR ) + { + stats->smpl_indels[ismpl]++; + if ( gt==GT_HET_RA || gt==GT_HET_AA ) { - if ( line_type&VCF_INDEL && stats->smpl_frm_shifts ) + 
int is_ins = 0, is_del = 0; + if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) { - assert( ialn_allele ); - stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; + if ( line->d.var[ial].n < 0 ) is_del = 1; + else is_ins = 1; } - if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++; - if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++; - continue; - } - if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; } - #if HWE_STATS - switch (gt) + if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) { - case GT_HOM_RR: nref_tot++; break; - case GT_HET_RA: nhet_tot++; break; - case GT_HET_AA: - case GT_HOM_AA: nalt_tot++; break; - } - #endif - int var_type = 0; - if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); - if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); - if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP - { - if ( gt == GT_HET_RA ) stats->smpl_hets[is]++; - else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++; - else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++; - else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++; - if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called - { - int alt = bcf_acgt2int(*line->d.allele[ial]); - if ( alt<0 ) continue; - if ( abs(ref-alt)==2 ) - stats->smpl_ts[is]++; - else - stats->smpl_tv[is]++; + if ( line->d.var[jal].n < 0 ) is_del = 1; + else is_ins = 1; } + // Note that alt-het genotypes with both ins and del allele are counted twice!! 
+ if ( is_del ) stats->smpl_del_hets[ismpl]++; + if ( is_ins ) stats->smpl_ins_hets[ismpl]++; } - if ( var_type&VCF_INDEL ) + else if ( gt==GT_HOM_AA ) { - if ( gt != GT_HOM_RR ) - { - stats->smpl_indels[is]++; - - if ( gt==GT_HET_RA || gt==GT_HET_AA ) - { - int is_ins = 0, is_del = 0; - if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) - { - if ( line->d.var[ial].n < 0 ) is_del = 1; - else is_ins = 1; - update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); - } - if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) - { - if ( line->d.var[jal].n < 0 ) is_del = 1; - else is_ins = 1; - update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); - } - // Note that alt-het genotypes with both ins and del allele are counted twice!! - if ( is_del ) stats->smpl_del_hets[is]++; - if ( is_ins ) stats->smpl_ins_hets[is]++; - } - else if ( gt==GT_HOM_AA ) - { - if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; - else stats->smpl_ins_homs[is]++; - } - } - if ( stats->smpl_frm_shifts ) - { - assert( ialn_allele && jaln_allele ); - stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; - stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++; - } + if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++; + else stats->smpl_ins_homs[ismpl]++; } } - if ( n_nref==1 ) stats->smpl_sngl[i_nref]++; + if ( stats->smpl_frm_shifts ) + { + assert( ialn_allele && jaln_allele ); + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++; + } } +} +static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) +{ + bcf_srs_t *files = args->files; + bcf1_t *line = reader->buffer[0]; - #if HWE_STATS - if ( nhet_tot + nref_tot + nalt_tot ) + args->nref_tot = 0; + args->nhet_tot = 0; + args->nalt_tot = 0; + args->n_nref = 0; + args->i_nref = 0; + + bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT"); + bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD"); + bcf_fmt_t *dp_fmt_ptr = 
bcf_get_fmt(reader->header,reader->buffer[0],"DP"); + + int is; + for (is=0; isfiles->n_smpl; is++) + { + // Determine depth + int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr); + if ( dp>0 ) { - float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot); - int idx = het_frac*(args->naf_hwe - 1); -//check me: what is this? - if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1]; - stats->af_hwe[idx]++; + (*idist(&stats->dp, dp))++; + stats->smpl_ndp[is]++; + stats->smpl_dp[is] += dp; } - #endif - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) ) - { - #define BRANCH_INT(type_t,missing,vector_end) { \ - int is; \ - for (is=0; isfiles->n_smpl; is++) \ - { \ - type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ - if ( *p==vector_end ) continue; \ - if ( *p!=missing ) \ - { \ - (*idist(&stats->dp, *p))++; \ - stats->smpl_ndp[is]++; \ - stats->smpl_dp[is] += *p; \ - } \ - } \ + // Determine genotype + int ial, jal, gt=GT_UNKN; + if ( gt_fmt_ptr ) + { + gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal); + sample_gt_stats(args,stats,line,is,gt,ial,jal); } - switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; + + // Determine variant allele frequency + if ( dp>0 && ad_fmt_ptr ) + { + float iad = 0, jad = 0; + if ( gt==GT_UNKN ) // GT not available + { + iad = get_ad(line,ad_fmt_ptr,is,&ial); + } + else if ( gt!=GT_UNKN ) + { + iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial); + jad = jal==0 ? 
0 : get_iad(line,ad_fmt_ptr,is,jal); + } + if ( iad ) + { + update_dvaf(stats,line,ial,(float)iad/dp); + update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp); + } + if ( jad && iad!=jad ) + { + update_dvaf(stats,line,jal,(float)jad/dp); + update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp); + } } - #undef BRANCH_INT } - else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) + if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++; + +#if HWE_STATS + if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) ) { - #define BRANCH_INT(type_t,missing,vector_end) { \ - int is,iv; \ - for (is=0; isfiles->n_smpl; is++) \ - { \ - type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ - int dp = 0, has_value = 0; \ - for (iv=0; ivn; iv++) \ - { \ - if ( p[iv]==vector_end ) break; \ - if ( p[iv]==missing ) continue; \ - has_value = 1; \ - dp += p[iv]; \ - } \ - if ( has_value ) \ - { \ - (*idist(&stats->dp, dp))++; \ - stats->smpl_ndp[is]++; \ - stats->smpl_dp[is] += dp; \ - } \ - } \ - } - switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; - } - #undef BRANCH_INT + // Number of heterozygous genotypes observed for any given allele frequency. This is used + // by plot-vcfstats to show the observed vs expected number of hets. There the expected number + // of hets is calculated from the probability P(het) = 2*AF*(1-AF). + // The array af_hwe is organized as follows + // m_af .. number of allele frequency bins + // naf_hwe .. 
the number of het genotype frequency bins + // iallele_freq*naf_hwe + ihet_freq + // + float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot); + int ihet_freq = het_frac * (args->naf_hwe - 1); + int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe; + stats->af_hwe[idx]++; } +#endif if ( matched==3 ) { @@ -1200,8 +1292,8 @@ static void do_vcf_stats(args_t *args) if ( files->n_smpl ) do_sample_stats(args, stats, reader, ret); - if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 ) - (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; + if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 ) + (*idist(&stats->dp_sites, args->iarr[0]))++; } } @@ -1736,6 +1828,24 @@ static void print_stats(args_t *args) } #endif } + + if ( args->stats[0].smpl_vaf ) + { + printf("# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n"); + printf("# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n"); + for (id=0; idnstats; id++) + { + stats_t *stats = &args->stats[id]; + for (i=0; ifiles->n_smpl; i++) + { + printf("VAF\t%d\t%s\t", id,args->files->samples[i]); + for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].snv[j]); + printf("\t"); + for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].indel[j]); + printf("\n"); + } + } + } } static void usage(void) diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 3b7da5a..11db1d1 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -72,6 +72,13 @@ typedef struct } idist_t; +// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins +typedef struct +{ + int snv[21], indel[21]; +} +vaf_t; + typedef struct { uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; @@ -95,7 +102,8 @@ typedef struct int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, 
*smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; - int *smpl_frm_shifts; // not-applicable, in-frame, out-frame + int *smpl_frm_shifts; // not-applicable, in-frame, out-frame + vaf_t vaf, *smpl_vaf; // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions unsigned long int *smpl_dp; idist_t dp, dp_sites; int nusr; @@ -143,7 +151,9 @@ typedef struct gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons bin_t *af_bins; float *farr; - int mfarr; + int32_t *iarr; + int mfarr, miarr; + int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref; // indel context indel_ctx_t *indel_ctx; @@ -449,6 +459,8 @@ static void init_stats(args_t *args) if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) ) error("No such INFO tag: %s\n", args->af_tag); + int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 
1 : 0; + #if QUAL_STATS args->m_qual = 999; #endif @@ -503,6 +515,8 @@ static void init_stats(args_t *args) stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int)); stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int)); + if ( has_fmt_ad ) + stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t)); #if HWE_STATS stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int)); #endif @@ -588,6 +602,7 @@ static void destroy_stats(args_t *args) free(stats->smpl_dp); free(stats->smpl_ndp); free(stats->smpl_sngl); + free(stats->smpl_vaf); idist_destroy(&stats->dp); idist_destroy(&stats->dp_sites); for (j=0; jnusr; j++) @@ -604,6 +619,7 @@ static void destroy_stats(args_t *args) for (j=0; jnusr; j++) free(args->usr[j].tag); if ( args->af_bins ) bin_destroy(args->af_bins); free(args->farr); + free(args->iarr); free(args->usr); free(args->tmp_frm); free(args->tmp_iaf); @@ -617,6 +633,8 @@ static void destroy_stats(args_t *args) if (args->filter[1]) filter_destroy(args->filter[1]); } +// The arary tmp_iaf keeps the index of AF bin for each allele, the first bin is for singletons. 
+// The number of bins, either m_af (101) or as given by the user in --af-bins static void init_iaf(args_t *args, bcf_sr_t *reader) { bcf1_t *line = reader->buffer[0]; @@ -871,205 +889,279 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) } } -static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) +// Returns the max non-ref AD value +static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial) { - if ( !fmt ) return; - - float dvaf; + int iv, ad = 0; + *ial = 0; #define BRANCH_INT(type_t,missing,vector_end) { \ - type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ - if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ - if ( p[ial]==missing || p[jal]==missing ) return; \ - if ( !p[ial] && !p[jal] ) return; \ - dvaf = (float)p[ial]/(p[ial]+p[jal]); \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + for (iv=1; ivn; iv++) \ + { \ + if ( ptr[iv]==vector_end ) break; \ + if ( ptr[iv]==missing ) continue; \ + if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\ + } \ } - switch (fmt->type) { + switch (ad_fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); bcftools_exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break; } #undef BRANCH_INT - + return ad; +} +static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial) +{ + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + if ( ptr[ial]==vector_end ) return 0; \ + if ( ptr[ial]==missing ) return 0; \ + return ptr[ial]; \ + } + switch 
(ad_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break; + } + #undef BRANCH_INT +} +static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf) +{ int len = line->d.var[ial].n; if ( len < -stats->m_indel ) len = -stats->m_indel; else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; - stats->dvaf[bin] += dvaf; + stats->dvaf[bin] += vaf; +} +#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05)) +static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf) +{ + int idx = vaf2bin(vaf); + if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++; + else smpl_vaf->indel[idx]++; } -static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) +static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr) { - bcf_srs_t *files = args->files; - bcf1_t *line = reader->buffer[0]; - bcf_fmt_t *fmt_ptr; - int nref_tot = 0, nhet_tot = 0, nalt_tot = 0; - int line_type = bcf_get_variant_types(line); + if ( dp_fmt_ptr ) + { + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \ + if ( *ptr==missing || *ptr==vector_end ) return -1; \ + return *ptr; \ + } + switch (dp_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, 
dp_fmt_ptr->type); bcftools_exit(1); break; + } + #undef BRANCH_INT + } + if ( ad_fmt_ptr ) + { + int iv, dp = 0, has_value = 0; + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \ + for (iv=0; ivn; iv++) \ + { \ + if ( ptr[iv]==vector_end ) break; \ + if ( ptr[iv]==missing ) continue; \ + has_value = 1; \ + dp += ptr[iv]; \ + } \ + } + switch (ad_fmt_ptr->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break; + } + #undef BRANCH_INT + if ( !has_value ) return -1; + return dp; + } + return -1; +} +static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal) +{ + if ( gt==GT_UNKN ) + { + stats->smpl_missing[ismpl]++; + return; + } - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) + int var_type = 0; + if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); + if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); + if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) { - bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? 
bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; + if ( var_type&VCF_INDEL && stats->smpl_frm_shifts ) + { + assert( ialn_allele ); + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; + } + if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++; + if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++; + return; + } + if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; } + #if HWE_STATS + switch (gt) + { + case GT_HOM_RR: args->nref_tot++; break; + case GT_HET_RA: args->nhet_tot++; break; + case GT_HET_AA: + case GT_HOM_AA: args->nalt_tot++; break; + } + #endif - int ref = bcf_acgt2int(*line->d.allele[0]); - int is, n_nref = 0, i_nref = 0; - for (is=0; isfiles->n_smpl; is++) + if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP + { + if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++; + else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++; + else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++; + else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++; + if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called { - int ial, jal; - int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal); - if ( gt==GT_UNKN ) - { - stats->smpl_missing[is]++; - continue; - } - if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) + int ref = bcf_acgt2int(*line->d.allele[0]); + int alt = bcf_acgt2int(*line->d.allele[ial]); + if ( alt<0 ) return; + if ( abs(ref-alt)==2 ) + stats->smpl_ts[ismpl]++; + else + stats->smpl_tv[ismpl]++; + } + if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal ) + { + int ref = bcf_acgt2int(*line->d.allele[0]); + int alt = bcf_acgt2int(*line->d.allele[jal]); + if ( alt<0 ) return; + if ( abs(ref-alt)==2 ) + stats->smpl_ts[ismpl]++; + else + stats->smpl_tv[ismpl]++; + } + } + if ( var_type&VCF_INDEL ) + { + if ( gt != GT_HOM_RR ) + { + stats->smpl_indels[ismpl]++; + if ( gt==GT_HET_RA || gt==GT_HET_AA ) { - if ( line_type&VCF_INDEL && stats->smpl_frm_shifts ) + 
int is_ins = 0, is_del = 0; + if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) { - assert( ialn_allele ); - stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; + if ( line->d.var[ial].n < 0 ) is_del = 1; + else is_ins = 1; } - if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++; - if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++; - continue; - } - if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; } - #if HWE_STATS - switch (gt) + if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) { - case GT_HOM_RR: nref_tot++; break; - case GT_HET_RA: nhet_tot++; break; - case GT_HET_AA: - case GT_HOM_AA: nalt_tot++; break; - } - #endif - int var_type = 0; - if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); - if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); - if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP - { - if ( gt == GT_HET_RA ) stats->smpl_hets[is]++; - else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++; - else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++; - else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++; - if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called - { - int alt = bcf_acgt2int(*line->d.allele[ial]); - if ( alt<0 ) continue; - if ( abs(ref-alt)==2 ) - stats->smpl_ts[is]++; - else - stats->smpl_tv[is]++; + if ( line->d.var[jal].n < 0 ) is_del = 1; + else is_ins = 1; } + // Note that alt-het genotypes with both ins and del allele are counted twice!! 
+ if ( is_del ) stats->smpl_del_hets[ismpl]++; + if ( is_ins ) stats->smpl_ins_hets[ismpl]++; } - if ( var_type&VCF_INDEL ) + else if ( gt==GT_HOM_AA ) { - if ( gt != GT_HOM_RR ) - { - stats->smpl_indels[is]++; - - if ( gt==GT_HET_RA || gt==GT_HET_AA ) - { - int is_ins = 0, is_del = 0; - if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) - { - if ( line->d.var[ial].n < 0 ) is_del = 1; - else is_ins = 1; - update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); - } - if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) - { - if ( line->d.var[jal].n < 0 ) is_del = 1; - else is_ins = 1; - update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); - } - // Note that alt-het genotypes with both ins and del allele are counted twice!! - if ( is_del ) stats->smpl_del_hets[is]++; - if ( is_ins ) stats->smpl_ins_hets[is]++; - } - else if ( gt==GT_HOM_AA ) - { - if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; - else stats->smpl_ins_homs[is]++; - } - } - if ( stats->smpl_frm_shifts ) - { - assert( ialn_allele && jaln_allele ); - stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; - stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++; - } + if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++; + else stats->smpl_ins_homs[ismpl]++; } } - if ( n_nref==1 ) stats->smpl_sngl[i_nref]++; + if ( stats->smpl_frm_shifts ) + { + assert( ialn_allele && jaln_allele ); + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++; + stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++; + } } +} +static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) +{ + bcf_srs_t *files = args->files; + bcf1_t *line = reader->buffer[0]; - #if HWE_STATS - if ( nhet_tot + nref_tot + nalt_tot ) + args->nref_tot = 0; + args->nhet_tot = 0; + args->nalt_tot = 0; + args->n_nref = 0; + args->i_nref = 0; + + bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT"); + bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD"); + bcf_fmt_t *dp_fmt_ptr = 
bcf_get_fmt(reader->header,reader->buffer[0],"DP"); + + int is; + for (is=0; isfiles->n_smpl; is++) + { + // Determine depth + int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr); + if ( dp>0 ) { - float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot); - int idx = het_frac*(args->naf_hwe - 1); -//check me: what is this? - if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1]; - stats->af_hwe[idx]++; + (*idist(&stats->dp, dp))++; + stats->smpl_ndp[is]++; + stats->smpl_dp[is] += dp; } - #endif - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) ) - { - #define BRANCH_INT(type_t,missing,vector_end) { \ - int is; \ - for (is=0; isfiles->n_smpl; is++) \ - { \ - type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ - if ( *p==vector_end ) continue; \ - if ( *p!=missing ) \ - { \ - (*idist(&stats->dp, *p))++; \ - stats->smpl_ndp[is]++; \ - stats->smpl_dp[is] += *p; \ - } \ - } \ + // Determine genotype + int ial, jal, gt=GT_UNKN; + if ( gt_fmt_ptr ) + { + gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal); + sample_gt_stats(args,stats,line,is,gt,ial,jal); } - switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break; + + // Determine variant allele frequency + if ( dp>0 && ad_fmt_ptr ) + { + float iad = 0, jad = 0; + if ( gt==GT_UNKN ) // GT not available + { + iad = get_ad(line,ad_fmt_ptr,is,&ial); + } + else if ( gt!=GT_UNKN ) + { + iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial); + jad = jal==0 ? 
0 : get_iad(line,ad_fmt_ptr,is,jal); + } + if ( iad ) + { + update_dvaf(stats,line,ial,(float)iad/dp); + update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp); + } + if ( jad && iad!=jad ) + { + update_dvaf(stats,line,jal,(float)jad/dp); + update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp); + } } - #undef BRANCH_INT } - else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) + if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++; + +#if HWE_STATS + if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) ) { - #define BRANCH_INT(type_t,missing,vector_end) { \ - int is,iv; \ - for (is=0; isfiles->n_smpl; is++) \ - { \ - type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ - int dp = 0, has_value = 0; \ - for (iv=0; ivn; iv++) \ - { \ - if ( p[iv]==vector_end ) break; \ - if ( p[iv]==missing ) continue; \ - has_value = 1; \ - dp += p[iv]; \ - } \ - if ( has_value ) \ - { \ - (*idist(&stats->dp, dp))++; \ - stats->smpl_ndp[is]++; \ - stats->smpl_dp[is] += dp; \ - } \ - } \ - } - switch (fmt_ptr->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break; - } - #undef BRANCH_INT + // Number of heterozygous genotypes observed for any given allele frequency. This is used + // by plot-vcfstats to show the observed vs expected number of hets. There the expected number + // of hets is calculated from the probability P(het) = 2*AF*(1-AF). + // The array af_hwe is organized as follows + // m_af .. number of allele frequency bins + // naf_hwe .. 
the number of het genotype frequency bins + // iallele_freq*naf_hwe + ihet_freq + // + float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot); + int ihet_freq = het_frac * (args->naf_hwe - 1); + int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe; + stats->af_hwe[idx]++; } +#endif if ( matched==3 ) { @@ -1202,8 +1294,8 @@ static void do_vcf_stats(args_t *args) if ( files->n_smpl ) do_sample_stats(args, stats, reader, ret); - if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 ) - (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; + if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 ) + (*idist(&stats->dp_sites, args->iarr[0]))++; } } @@ -1738,6 +1830,24 @@ static void print_stats(args_t *args) } #endif } + + if ( args->stats[0].smpl_vaf ) + { + fprintf(bcftools_stdout, "# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n"); + fprintf(bcftools_stdout, "# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n"); + for (id=0; idnstats; id++) + { + stats_t *stats = &args->stats[id]; + for (i=0; ifiles->n_smpl; i++) + { + fprintf(bcftools_stdout, "VAF\t%d\t%s\t", id,args->files->samples[i]); + for (j=0; j<21; j++) fprintf(bcftools_stdout, "%s%d",j?",":"",stats->smpl_vaf[i].snv[j]); + fprintf(bcftools_stdout, "\t"); + for (j=0; j<21; j++) fprintf(bcftools_stdout, "%s%d",j?",":"",stats->smpl_vaf[i].indel[j]); + fprintf(bcftools_stdout, "\n"); + } + } + } } static void usage(void) diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index 96dcbc7..e09efa0 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Shane McCarthy @@ -76,6 +76,8 @@ typedef struct _args_t char *include_types, *exclude_types; int include, exclude; int record_cmd_line; + char *index_fn; + int write_index; htsFile *out; } args_t; @@ -532,6 +534,7 @@ static void usage(args_t *args) fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -548,6 +551,7 @@ int main_vcfview(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->write_index = 0; args->min_ac = args->max_ac = args->min_af = args->max_af = -1; args->regions_overlap = 1; args->targets_overlap = 0; @@ -596,6 +600,7 @@ int main_vcfview(int argc, char *argv[]) {"phased",no_argument,NULL,'p'}, {"exclude-phased",no_argument,NULL,'P'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -727,6 +732,7 @@ int main_vcfview(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -783,6 +789,8 @@ int main_vcfview(int argc, char *argv[]) else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); + if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out); + int ret = 0; if (!args->header_only) { @@ -795,7 +803,18 @@ int main_vcfview(int 
argc, char *argv[]) ret = args->files->errnum; if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } - hts_close(args->out); + + if (args->write_index) + { + if (bcf_idx_save(args->out) < 0) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); destroy_data(args); bcf_sr_destroy(args->files); free(args); diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 85f483d..1485b1e 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -2,7 +2,7 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Shane McCarthy @@ -78,6 +78,8 @@ typedef struct _args_t char *include_types, *exclude_types; int include, exclude; int record_cmd_line; + char *index_fn; + int write_index; htsFile *out; } args_t; @@ -534,6 +536,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); fprintf(bcftools_stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(bcftools_stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -550,6 +553,7 @@ int main_vcfview(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->write_index = 0; args->min_ac = args->max_ac = args->min_af = args->max_af = 
-1; args->regions_overlap = 1; args->targets_overlap = 0; @@ -598,6 +602,7 @@ int main_vcfview(int argc, char *argv[]) {"phased",no_argument,NULL,'p'}, {"exclude-phased",no_argument,NULL,'P'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -729,6 +734,7 @@ int main_vcfview(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -785,6 +791,8 @@ int main_vcfview(int argc, char *argv[]) else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); + if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out); + int ret = 0; if (!args->header_only) { @@ -797,7 +805,18 @@ int main_vcfview(int argc, char *argv[]) ret = args->files->errnum; if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } - hts_close(args->out); + + if (args->write_index) + { + if (bcf_idx_save(args->out) < 0) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"bcftools_stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"bcftools_stdout"); destroy_data(args); bcf_sr_destroy(args->files); free(args); diff --git a/bcftools/version.c b/bcftools/version.c index 4306d40..38417a7 100644 --- a/bcftools/version.c +++ b/bcftools/version.c @@ -1,6 +1,6 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -72,22 +72,26 @@ const char *hts_bcf_wmode(int file_type) const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); - int len = strlen(fname); - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); - if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); - if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); - if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); return hts_bcf_wmode(file_type); } void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; - int len = fname ? strlen(fname) : 0; - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); - else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF); - else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); - else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? 
fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF); + else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); else ret = hts_bcf_wmode(file_type); if ( clevel>=0 && clevel<=9 ) { @@ -107,3 +111,33 @@ int parse_overlap_option(const char *arg) else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; else return -1; } + +// See also samtools/sam_utils.c auto_index() +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname) +{ + int min_shift = 14; // CSI + + if ( !fname || !*fname || !strcmp(fname, "-") ) return -1; + + char *delim = strstr(fname, HTS_IDX_DELIM); + if (delim) + { + delim += strlen(HTS_IDX_DELIM); + *idx_fname = strdup(delim); + if ( !*idx_fname ) return -1; + + size_t l = strlen(*idx_fname); + if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0; + } + else + { + if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1; + sprintf(*idx_fname, "%s.csi", fname); + } + + if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1; + + return 0; +} + + diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index df12fc4..23949bf 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -2,7 +2,7 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -74,22 +74,26 @@ const char *hts_bcf_wmode(int file_type) const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); - int len = strlen(fname); - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); - if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); - if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); - if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); return hts_bcf_wmode(file_type); } void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; - int len = fname ? strlen(fname) : 0; - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); - else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF); - else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); - else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? 
fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF); + else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); else ret = hts_bcf_wmode(file_type); if ( clevel>=0 && clevel<=9 ) { @@ -109,3 +113,33 @@ int parse_overlap_option(const char *arg) else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; else return -1; } + +// See also samtools/sam_utils.c auto_index() +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname) +{ + int min_shift = 14; // CSI + + if ( !fname || !*fname || !strcmp(fname, "-") ) return -1; + + char *delim = strstr(fname, HTS_IDX_DELIM); + if (delim) + { + delim += strlen(HTS_IDX_DELIM); + *idx_fname = strdup(delim); + if ( !*idx_fname ) return -1; + + size_t l = strlen(*idx_fname); + if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0; + } + else + { + if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1; + sprintf(*idx_fname, "%s.csi", fname); + } + + if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1; + + return 0; +} + + diff --git a/bcftools/version.sh b/bcftools/version.sh index 55d8042..69bf963 100755 --- a/bcftools/version.sh +++ b/bcftools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.17 +VERSION=1.18 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/cy_build.py b/cy_build.py deleted file mode 100644 index 59a6e12..0000000 --- a/cy_build.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -import re -import sys - -try: - from Cython.Distutils import build_ext -except ImportError: - from setuptools.command.build_ext import build_ext - -from distutils.extension import Extension -from distutils.sysconfig import get_config_var, get_config_vars, get_python_version -from pkg_resources import Distribution - - -if sys.platform == 'darwin': - config_vars = get_config_vars() - config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '') - config_vars['SHLIB_EXT'] = '.so' - - -def is_pip_install(): - if "_" in os.environ and os.environ["_"].endswith("pip"): - return True - if "pip-egg-info" in sys.argv: - return True - if re.search("/pip-.*-build/", __file__): - return True - return False - - -class CyExtension(Extension): - def __init__(self, *args, **kwargs): - self._init_func = kwargs.pop("init_func", None) - self._prebuild_func = kwargs.pop("prebuild_func", None) - Extension.__init__(self, *args, **kwargs) - - def extend_includes(self, includes): - self.include_dirs.extend(includes) - - def extend_macros(self, macros): - self.define_macros.extend(macros) - - def extend_extra_objects(self, objs): - self.extra_objects.extend(objs) - - -class cy_build_ext(build_ext): - - def _get_egg_name(self): - ei_cmd = self.get_finalized_command("egg_info") - return Distribution( - None, None, ei_cmd.egg_name, ei_cmd.egg_version, get_python_version(), - self.distribution.has_ext_modules() and self.plat_name).egg_name() - - def build_extension(self, ext): - - if isinstance(ext, CyExtension) and ext._init_func: - ext._init_func(ext) - - if not self.inplace: - ext.library_dirs.append(os.path.join(self.build_lib, "pysam")) - - if sys.platform == 'darwin': 
- # The idea is to give shared libraries an install name of the form - # `@rpath/`, and to set the rpath equal to - # @loader_path. This will allow Python packages to find the library - # in the expected place, while still giving enough flexibility to - # external applications to link against the library. - relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO')) - library_path = os.path.join( - "@rpath", os.path.basename(relative_module_path) - ) - - if not ext.extra_link_args: - ext.extra_link_args = [] - ext.extra_link_args += ['-dynamiclib', - '-rpath', '@loader_path', - '-Wl,-headerpad_max_install_names', - '-Wl,-install_name,%s' % library_path, - '-Wl,-x'] - else: - if not ext.extra_link_args: - ext.extra_link_args = [] - - ext.extra_link_args += ['-Wl,-rpath,$ORIGIN'] - - if isinstance(ext, CyExtension) and ext._prebuild_func: - ext._prebuild_func(ext, self.force) - - build_ext.build_extension(self, ext) diff --git a/devtools/import.py b/devtools/import.py index 90194d0..a4652f4 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -37,7 +37,7 @@ EXCLUDE = { "htslib": ( 'htslib/tabix.c', 'htslib/bgzip.c', 'htslib/htsfile.c', - "test", "tests"), + "samples", "test", "tests"), } diff --git a/devtools/install-prerequisites.sh b/devtools/install-prerequisites.sh new file mode 100755 index 0000000..eaedce1 --- /dev/null +++ b/devtools/install-prerequisites.sh @@ -0,0 +1,34 @@ +#!/bin/sh -e + +if test -x /usr/bin/dnf; then + echo Installing prerequisites via dnf... + dnf -y install epel-release + dnf -y install zlib-devel bzip2-devel xz-devel curl-devel samtools bcftools htslib-tools + +elif test -x /usr/bin/yum; then + if yum -y install epel-release; then + echo Installing prerequisites via yum... + yum -y install zlib-devel bzip2-devel xz-devel curl-devel samtools bcftools htslib-tools + else + echo Installing non-test prerequisites via yum... 
+ yum -y install zlib-devel bzip2-devel xz-devel curl-devel + fi + +elif test -d /etc/dpkg; then + echo Installing prerequisites via apt-get... + apt-get update + apt-get install -y --no-install-recommends --no-install-suggests libcurl4-openssl-dev zlib1g-dev libbz2-dev liblzma-dev samtools bcftools tabix + +elif test -x /sbin/apk; then + echo Installing non-test prerequisites via apk... + apk update + apk add zlib-dev bzip2-dev xz-dev curl-dev + +elif test -x ${HOMEBREW_PREFIX-/usr/local}/bin/brew; then + echo Installing prerequisites via brew... + HOMEBREW_NO_AUTO_UPDATE=1 brew install -q samtools bcftools + brew unlink xz || true + +else + echo No package manager detected +fi diff --git a/doc/conf.py b/doc/conf.py index aaf1d35..1ada4bc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -11,7 +11,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os, setuptools +import sys, os, re, setuptools # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -29,6 +29,7 @@ if os.path.exists(_libdir): # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary', + 'sphinx.ext.extlinks', 'sphinx.ext.todo', 'sphinx.ext.ifconfig', 'sphinx.ext.intersphinx', @@ -50,7 +51,7 @@ master_doc = 'index' # General information about the project. project = u'pysam' -copyright = u'2009–2021, Andreas Heger, Kevin Jacobs, et al' +copyright = '2009–2023 Andreas Heger, John Marshall, Kevin Jacobs, et al' # Included at the end of each rst file rst_epilog = ''' @@ -120,12 +121,33 @@ pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. 
#modindex_common_prefix = [] +# -- Rewrite "PR #NNN" and "#NNN" in NEWS as URL links ------------------------- + +extlinks = { + 'issue': ('https://github.com/pysam-developers/pysam/issues/%s', '#%s'), + 'pull': ('https://github.com/pysam-developers/pysam/pull/%s', 'PR #%s'), + } + +def expand_github_references(text): + text = re.sub(r'PR\s*#(\d+)', r':pull:`\1`', text) + text = re.sub(r'#(\d+)', r':issue:`\1`', text) + return text + +def include_read(app, relative_path, parent_docname, source): + if relative_path.name == 'NEWS': + source[0] = expand_github_references(source[0]) + +def setup(app): + try: + app.connect('include-read', include_read) + except: + pass # Sphinx is too old to link issues/PRs # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'default' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -207,7 +229,7 @@ htmlhelp_basename = 'samtoolsdoc' # (source start file, target name, title, author, documentclass [howto/manual]). 
latex_documents = [ ('index', 'pysam.tex', u'pysam documentation', - u'Andreas Heger, Kevin Jacobs, et al.', 'manual'), + 'Andreas Heger, John Marshall, Kevin Jacobs, et al', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/doc/index.rst b/doc/index.rst index 30474e6..0b4485c 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,7 +1,7 @@ pysam: htslib interface for python ================================== -:Author: Andreas Heger, Kevin Jacobs and contributors +:Author: Andreas Heger, John Marshall, Kevin Jacobs, and contributors :Date: |today| :Version: |version| @@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.17*, *samtools-1.17*, and *bcftools-1.17*. +The current version wraps *htslib-1.18*, *samtools-1.18*, and *bcftools-1.18*. To install the latest release, type:: @@ -26,6 +26,11 @@ To install the latest release, type:: See the :ref:`Installation notes ` for details. +This module is unrelated to NREL-PySAM_, which wraps the National Renewable +Energy Laboratory's System Advisor Model. + +.. _NREL-PySAM: https://nrel-pysam.readthedocs.io/ + Contents -------- diff --git a/doc/installation.rst b/doc/installation.rst index a286c27..a659f9d 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -35,7 +35,11 @@ The typical installation will be through pypi_:: pip install pysam -This will compile the ``builtin`` htslib source code within pysam. +Generally you will have the ``wheel`` package installed and +this command will speedily install pysam from a pre-built wheel. +Otherwise, or if you use pip's ``--no-binary`` option, this will +compile the ``builtin`` htslib source code within pysam and allow +the configuration facilities described below to be used. 
htslib_ can be configured at compilation to turn on additional features such support using encrypted configurations, enable plugins, diff --git a/doc/release.rst b/doc/release.rst index d731c36..0676a27 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,840 +2,4 @@ Release notes ============= -Release 0.19.1 -============== - -This release wraps htslib/samtools/bcftools version 1.15.1. - -* [#1104] add an add_samples() method to quickly add multiple samples - to VCF. - -Release 0.19.0 -============== - -This release wraps htslib/samtools/bcftools version 1.15. - -* [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands - -* [#1078] Support BAM_CPAD in get_aligned_pairs - -* [#1063] Run flake8 and fix some linting issues - -* [#1088] Add AlignedSegment is_mapped/mate_is_mapped/is_forward/mate_is_forward properties - -* Write an absent AlignedSegment.qual as all-bytes-0xff - -* Fix BGZFile.read() behaviour near or at EOF - -* First API for the htslib modified bases interface - -Release 0.18.0 -============== - -This release wraps htslib/samtools/bcftools version 1.14. - -* [#1048] and [#1060], clarify documentation of index statistics with CRAM files -* Prevent "retval may be used uninitialised" warning. -* Add new "samples" subcommand to pysam/samtools.py -* Introduce TupleProxyIterator iterator object class - -Release 0.17.0 -============== - -This release wraps htslib/samtools/bcftools version 1.13. Corresponding -to new samtools commands, `pysam.samtools` now has additional functions -`ampliconclip`, `ampliconstats`, `fqimport`, and `version`. 
- -Bugs fixed: - -* [#447] The maximum QNAME length is fully restored to 254 -* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors -* [#603] count_coverage: ignore reads that have no SEQ field -* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault -* [#983] Add win32/\*.[ch] to MANIFEST.in -* [#994] Raise exception in ``get_tid()`` if header could not be parsed -* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi -* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases -* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax -* [#1035] Improved handling of file iteration errors -* [#1038] ``tabix_index()`` no longer leaks file descriptors -* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value - (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based) -* *setup.py* longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0 - -New facilities: - -* [PR #963] Additional VCF classes are exposed to pysam programmers -* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion -* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize -* Running tests with ``pytest`` now automatically runs ``make`` to generate test data - -Documentation improvements: - -* [#726] Clarify get_forward_sequence/get_forward_qualities documentation -* [#865] Improved example -* [#968] ``get_index_statstics`` parameters -* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open. -* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation -* [#999] Fix documentation for ``AlignmentFile.get_reference_length()`` -* [#1002] Document the default min_base_quality for ``pileup()`` - - -Release 0.16.0 -============== - -This release wraps htslib/bcftools version 1.10.2 and samtools version -1.10. 
The following bugs reported against pysam are fixed due to this: - -* [#447] Writing out QNAME longer than 251 characters corrupts BAM -* [#640, #734, #843] Setting VariantRecord pos or stop raises error -* [#738, #919] FastxFile truncates concatenated plain gzip compressed files - -Additional bugfixes: - -* [#840] Pileup doesn't work on python3 when `index_filename` is used -* [#886] FastqProxy raises ValueError when instantiated from python -* [#904] VariantFile.fetch() throws ValueError on files with no records -* [#909] Fix incorrect quoting in VariantFile contig records -* [#915, #916] Implement pileup() for unindexed files and/or SAM files - -Backwards incompatible changes: - -* The `samtools import` command was removed in samtools 1.10, so pysam - no longer exports a `samimport` function. Use `pysam.view()` instead. - - -Release 0.15.4 -============== - -Bugfix release. Principal reason for release is to update cython -version in order to fix pip install pysam with python 3.8. - -* [#879] Fix add_meta function in libcbcf.pyx, so meta-information - lines in header added with this function have double-quoting rules - in accordance to rules specified in VCF4.2 and VCF4.3 specifications -* [#863] Force arg to bytes to support non-ASCII encoding -* [#875] Bump minimum Cython version -* [#868] Prevent segfault on Python 2.7 AlignedSegment.compare(other=None) -* [#867] Fix wheel building on TravisCI -* [#863] Force arg to bytes to support non-ASCII encoding -* [#799] disambiguate interpretation of bcf_read return code -* [#841] Fix silent truncation of FASTQ with bad q strings -* [#846] Prevent segmentation fault on ID, when handling malformed records -* [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars - - -Release 0.15.3 -============== - -Bugfix release. - -* [#824] allow reading of UTF-8 encoded text in VCF/BCF files. 
-* [#780] close all filehandles before opening new ones in pysam_dispatch -* [#773] do not cache VariantRecord.id to avoid memory leak -* [#781] default of multiple_iterators=True is changed to False for - CRAM files. -* [#825] fix collections.abc import -* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy - bug when setting FORMAT fields. -* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its - memory as needed, rather than a fixed-size char buffer. -* [#814] Build wheels and upload them to PyPI -* [#755] Allow passing flags and arguments to index methods -* [#763] Strip \0 in header check -* [#761] Test Tabix index contents, not the compression - -Release 0.15.2 -============== - -Bugfix release. - -* [#746] catch pileup itorator out-of-scope segfaults -* [#747] fix faixd fetch with region -* [#748] increase max_pos to (1<<31)-1 -* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA -* [#737] Fix bug in get_aligned_pairs, @bkohrn - -Release 0.15.1 -============== - -Bugfix release. - -* [#716] raise ValueError if tid is out of range when writing -* [#697] release version using cython 0.28.5 for python 3.7 - compatibility - -Release 0.15.0 -============== - -This release wraps htslib/samtools/bcftools version 1.9.0. - -* [#673] permit dash in chromosome name of region string -* [#656] Support `text` when opening a SAM file for writing -* [#658] return None in get_forward_sequence if sequence not in record -* [#683] allow lower case bases in MD tags -* Ensure that = and X CIGAR ops are treated the same as M - -Release 0.14.1 -============== - -This is mostly a bugfix release, though bcftools has now also been -upgraded to 1.7.0. 
- -* [#621] Add a warning to count_coverage when an alignment has an - empty QUAL field -* [#635] Speed-up of AlignedSegment.find_intro() -* treat border case of all bases in pileup column below quality score -* [#634] Fix access to pileup reference_sequence - - -Release 0.14.0 -============== - -This release wraps htslib/samtools versions 1.7.0. - -* SAM/BAM/CRAM headers are now managed by a separate AlignmentHeader - class. -* AlignmentFile.header.as_dict() returns an ordered dictionary. -* Use "stop" instead of "end" to ensure consistency to - VariantFile. The end designations have been kept for backwards - compatibility. - -* [#611] and [#293] CRAM repeated fetch now works, each iterator - reloads index if multiple_iterators=True -* [#608] pysam now wraps htslib 1.7 and samtools 1.7. -* [#580] reference_name and next_reference_name can now be set to "*" - (will be converted to None to indicate an unmapped location) -* [#302] providing no coordinate to count_coverage will not count from - start/end of contig. -* [#325] @SQ records will be automatically added to header if they are - absent from text section of header. -* [#529] add get_forward_sequence() and get_forward_qualities() - methods -* [#577] add from_string() and to_dict()/from_dict() methods to - AlignedSegment. Rename tostring() to to_string() throughout for - consistency -* [#589] return None from build_alignment_sequence if no MD tag is set -* [#528] add PileupColumn.__len__ method - -Backwards incompatible changes: - -* AlignmentFile.header now returns an AlignmentHeader object. Use - AlignmentFile.header.to_dict() to get the dictionary as - previously. Most dictionary accessor methods (keys(), values(), - __getitem__, ...) have been implemented to ensure some level of - backwards compatibility when only reading. - - The rationale for this change is to have consistency between - AlignmentFile and VariantFile. 
- -* AlignmentFile and FastaFile now raise IOError instead of OSError - -Medium term we plan to have a 1.0 release. The pysam -interface has grown over the years and the API is cluttered with -deprecated names (Samfile, getrname(), gettid(), ...). To work towards -this, the next release (0.15.0) will yield DeprecationWarnings -for any parts of the API that are considered obsolete and will not be -in 1.0. Once 1.0 has been reached, we will use semantic versioning. - -Release 0.13.0 -=============== - -This release wraps htslib/samtools/bcftools versions 1.6.0 and -contains a series of bugfixes. - -* [#544] reading header from remote TabixFiles now works. -* [#531] add missing tag types H and A. A python float will now be - added as 'f' type instead of 'd' type. -* [#543] use FastaFile instead of Fastafile in pileup. -* [#546] set is_modified flag in setAttribute so updated attributes - are output. -* [#537] allow tabix index files to be created in a custom location. -* [#530] add get_index_statistics() method - - -Release 0.12.0.1 -================ - -Bugfix release to solve compilation issue due to missinge -bcftools/config.h file. - -Release 0.12.0 -============== - -This release wraps htslib/samtools/bcftools versions 1.5.0 and -contains a series of bugfixes. - -* [#473] A new FastxRecord class that can be instantiated from class and - modified in-place. Replaces PersistentFastqProxy. -* [#521] In AligmentFile, Simplify file detection logic and allow remote index files - - * Removed attempts to guess data and index file names; this is magic left - to htslib. - * Removed file existence check prior to opening files with htslib - * Better error checking after opening files that raise the appropriate - error (IOError for when errno is set, ValueError otherwise for backward - compatibility). - * Report IO errors when loading an index by name. - * Allow remote indices (tested using S3 signed URLs). - * Document filepath_index and make it an alias for index_filename. 
- * Added a require_index parameter to AlignmentFile - -* [#526] handle unset ref when creating new records -* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid - segfaults -* [#516] expose IO errors via IOError exceptions -* [#487] add tabix line_skip, remove 'pileup' preset -* add FastxRecord, replaces PersistentFastqProxy (still present for - backwards compatibility) -* [#496] upgrade to htslib/samtools/bcftools versions 1.5 -* add start/stop to AlignmentFile.fetch() to be consistent with - VariantFile.fetch(). "end" is kept for backwards compatibility. -* [#512] add get_index_statistics() method to AlignmentFile. - -Upcoming changes: - -In the next release we are plannig to separate the header information -from AlignmentFile into a separate class AlignmentHeader. This layout -is similar to VariantFile/VariantHeader. With this change we will -ensure that an AlignedSegment record will be linked to a header so -that chromosome names can be automatically translated from the numeric -representation. As a consequence, the way new AlignedSegment records -are created will need to change as the constructor requires a header:: - - header = pysam.AlignmentHeader( - reference_names=["chr1", "chr2"], - reference_lengths=[1000, 1000]) - - read = pysam.AlignedSegment(header) - -This will affect all code that instantiates AlignedSegment objects -directly. We have not yet merged to allow users to provide feed-back. -The pull-request is here: https://github.com/pysam-developers/pysam/pull/518 -Please comment on github. - -Release 0.11.2.2 -================ - -Bugfix release to address two issues: - -* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and - more tests have been added. -* [#479] Correct VariantRecord edge cases described in issue - -Release 0.11.2.1 -================ - -Release to fix release tar-ball containing 0.11.1 pre-compiled -C-files. 
- -Release 0.11.2 -============== - -This release wraps htslib/samtools/bcfools versions 1.4.1 in response -to a security fix in these libraries. Additionally the following -issues have been fixed: - -* [#452] add GFF3 support for tabix parsers -* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END -* [#447] limit query name to 251 characters (only partially addresses issue) - -VariantFile and related object fixes - -* Restore VariantFile.\_\_dealloc\_\_ -* Correct handling of bcf_str_missing in bcf_array_to_object and - bcf_object_to_array -* Added update() and pop() methods to some dict-like proxy objects -* scalar INFO entries could not be set again after being deleted -* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without - raising a KeyError -* Multiple other fixes for VariantRecordInfo methods -* INFO/END is now accessible only via VariantRecord.stop and - VariantRecord.rlen. Even if present behind the scenes, it is no longer - accessible via VariantRecordInfo. -* Add argument to issue a warning instead of an exception if input appears - to be truncated - -Other features and fixes: - -* Make AlignmentFile \_\_dealloc\_\_ and close more - stringent -* Add argument AlignmentFile to issue a warning instead of an - exception if input appears to be truncated - -Release 0.11.1 -============== - -Bugfix release - -* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility. - -Release 0.11.0 -============== - -This release wraps the latest versions of htslib/samtools/bcftools and -implements a few bugfixes. 
- -* [#413] Wrap HTSlib/Samtools/BCFtools 1.4 -* [#422] Fix missing pysam.sort.usage() message -* [#411] Fix BGZfile initialization bug -* [#412] Add seek support for BGZFile -* [#395] Make BGZfile iterable -* [#433] Correct getQueryEnd -* [#419] Export SAM enums such as pysam.CMATCH -* [#415] Fix access by tid in AlignmentFile.fetch() -* [#405] Writing SAM now outputs a header by default. -* [#332] split infer_query_length(always) into infer_query_length and infer_read_length - -Release 0.10.0 -============== - -This release implements further functionality in the VariantFile API -and includes several bugfixes: - -* treat special case -c option in samtools view outputs to stdout even - if -o given, fixes #315 -* permit reading BAM files with CSI index, closes #370 -* raise Error if query name exceeds maximum length, fixes #373 -* new method to compute hash value for AlignedSegment -* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile -* Avoid segfault by detecting out of range reference_id and - next_reference in AlignedSegment.tostring -* Issue #355: Implement streams using file descriptors for VariantFile -* upgrade to htslib 1.3.2 -* fix compilation with musl libc -* Issue #316, #360: Rename all Cython modules to have lib as a prefix -* Issue #332, hardclipped bases in cigar included by - pysam.AlignedSegment.infer_query_length() -* Added support for Python 3.6 filename encoding protocol -* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord -* Issue #331, fix failure in VariantFile.reset() method -* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and - VariantRecord.copy() methods to create new VariantRecord objects -* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters -* Preliminary (potentially unsafe) support for removing and altering header metadata -* Many minor fixes and improvements to VariantFile and related objects - -Please note that all internal cython 
extensions now have a lib prefix -to facilitate linking against pysam extension modules. Any user cython -extensions using cimport to import pysam definitions will need -changes, for example:: - - cimport pysam.csamtools - -will become:: - - cimport pysam.libcsamtools - -Release 0.9.1 -============= - -This is a bugfix release addressing some installation problems -in pysam 0.9.0, in particular: - -* patch included htslib to work with older libcurl versions, fixes #262. -* do not require cython for python 3 install, fixes #260 -* FastaFile does not accept filepath_index any more, see #270 -* add AlignedSegment.get_cigar_stats method. -* py3 bugfix in VariantFile.subset_samples, fixes #272 -* add missing sysconfig import, fixes #278 -* do not redirect stdout, but instead write to a separately - created file. This should resolve issues when pysam is used - in notebooks or other environments that redirect stdout. -* wrap htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1 -* use bgzf throughout instead of gzip -* allow specifying a fasta reference for CRAM file when opening - for both read and write, fixes #280 - -Release 0.9.0 -============= - -Overview --------- - -The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other -enhancements and bugfixes. See below for a detailed list. - -`Htslib 1.3 `_ -comes with additional capabilities for remote file access which depend -on the presence of optional system libraries. As a consequence, the -installation script :file:`setup.py` has become more complex. For an -overview, see :ref:`installation`. We have tested installation on -linux and OS X, but could not capture all variations. It is possible -that a 0.9.1 release might follow soon addressing installation issues. - -The :py:class:`~.pysam.VariantFile` class provides access to -:term:`vcf` and :term:`bcf` formatted files. The class is certainly -usable and interface is reaching completion, but the API and the -functionality is subject to change. 
- -Detailed release notes ----------------------- - -* upgrade to htslib 1.3 -* python 3 compatibility tested throughout. -* added a first set of bcftools commands in the pysam.bcftools - submodule. -* samtools commands are now in the pysam.samtools module. For - backwards compatibility they are still imported into the pysam - namespace. -* samtools/bcftools return stdout as a single (byte) string. As output - can be binary (VCF.gz, BAM) this is necessary to ensure py2/py3 - compatibility. To replicate the previous behaviour in py2.7, use:: - - pysam.samtools.view(self.filename).splitlines(True) - -* get_tags() returns the tag type as a character, not an integer (#214) -* TabixFile now raises ValueError on indices created by tabix <1.0 (#206) -* improve OSX installation and develop mode -* FastxIterator now handles empty sequences (#204) -* TabixFile.isremote is not TabixFile.is_remote in line with AlignmentFile -* AlignmentFile.count() has extra optional argument read_callback -* setup.py has been changed to: - * install a single builtin htslib library. Previously, each pysam - module contained its own version. This reduces compilation time - and code bloat. - * run configure for the builtin htslib library in order to detect - optional libraries such as libcurl. Configure behaviour can be - controlled by setting the environment variable - HTSLIB_CONFIGURE_OPTIONS. -* get_reference_sequence() now returns the reference sequence and not - something looking like it. This bug had effects on - get_aligned_pairs(with_seq=True), see #225. If you have relied on on - get_aligned_pairs(with_seq=True) in pysam-0.8.4, please check your - results. -* improved autodetection of file formats in AlignmentFile and VariantFile. - -Release 0.8.4 -============= - -This release contains numerous bugfixes and a first implementation of -a pythonic interface to VCF/BCF files. 
Note that this code is still -incomplete and preliminary, but does offer a nearly complete immutable -Pythonic interface to VCF/BCF metadata and data with reading and -writing capability. - -Potential isses when upgrading from v0.8.3: - -* binary tags are now returned as python arrays - -* renamed several methods for pep8 compatibility, old names still retained for - backwards compatibility, but should be considered deprecated. - - * gettid() is now get_tid() - * getrname() is now get_reference_name() - * parseRegion() is now parse_region() - -* some methods have changed for pep8 compatibility without the old - names being present: - - * fromQualityString() is now qualitystring_to_array() - * toQualityString() is now qualities_to_qualitystring() - -* faidx now returns strings and not binary strings in py3. - -* The cython components have been broken up into smaller files with - more specific content. This will affect users using the cython - interfaces. - -Edited list of commit log changes: - -* fixes AlignmentFile.check_index to return True -* add RG/PM header tag - closes #179 -* add with_seq option to get_aligned_pairs -* use char * inside reconsituteReferenceSequence -* add soft clipping for get_reference_sequence -* add get_reference_sequence -* queryEnd now computes length from cigar string if no sequence present, closes #176 -* tolerate missing space at end of gtf files, closes #162 -* do not raise Error when receiving output on stderr -* add docu about fetching without index, closes #170 -* FastaFile and FastxFile now return strings in python3, closes #173 -* py3 compat: relative -> absolute imports. -* add reference_name and next_reference_name attributes to AlignedSegment -* add function signatures to cvcf cython. Added note about other VCF code. -* add context manager functions to FastaFile -* add reference_name and next_reference_name attributes to AlignedSegment -* PileupColumn also gets a reference_name attribute. 
-* add context manager functions to FastaFile -* TabixFile.header for remote files raises AttributeError, fixes #157 -* add context manager interface to TabixFile, closes #165 -* change ctypedef enum to typedef enum for cython 0.23 -* add function signatures to cvcf cython, also added note about other VCF code -* remove exception for custom upper-case header record tags. -* rename VALID_HEADER_FIELDS to KNOWN_HEADER_FIELDS -* fix header record tag parsing for custom tags. -* use cython.str in count_coverage, fixes #141 -* avoid maketrans (issues with python3) -* refactoring: AlignedSegment now in separate module -* do not execute remote tests if URL not available -* fix the unmapped count, incl reads with no SQ group -* add raw output to tags -* added write access for binary tags -* bugfix in call to resize -* implemented writing of binary tags from arrays -* implemented convert_binary_tag to use arrays -* add special cases for reads that are unmapped or whose mates are unmapped. -* rename TabProxies to ctabixproxies -* remove underscores from utility functions -* move utility methods into cutils -* remove callback argument to fetch - closes #128 -* avoid calling close in dealloc -* add unit tests for File object opening -* change AlignmentFile.open to filepath_or_object -* implement copy.copy, close #65 -* add chaching of array attributes in AlignedSegment, closes #121 -* add export of Fastafile -* remove superfluous pysam_dispatch -* use persist option in FastqFile -* get_tag: expose tag type if requested with `with_value_type` -* fix to allow reading vcf record info via tabix-based vcf reader -* add pFastqProxy and pFastqFile objects to make it possible to work with multiple fastq records per file handle, unlike FastqProxy/FastqFile. 
-* release GIL around htslib IO operations -* More work on read/write support, API improvements -* add `phased` property on `VariantRecordSample` -* add mutable properties to VariantRecord -* BCF fixes and start of read/write support -* VariantHeaderRecord objects now act like mappings for attributes. -* add VariantHeader.alts dict from alt ID->Record. -* Bug fix to strong representation of structured header records. -* VariantHeader is now mutable - - -Release 0.8.3 -============= - -* samtools command now accept the "catch_stdout" option. - -* get_aligned_pairs now works for soft-clipped reads. - -* query_position is now None when a PileupRead is not aligned - to a particular position. - -* AlignedSegments are now comparable and hashable. - -Release 0.8.2.1 -=============== - -* Installation bugfix release. - -Release 0.8.2 -============= - -* Pysam now wraps htslib 1.2.1 and samtools version 1.2. - -* Added CRAM file support to pysam. - -* New alignment info interface. - * opt() and setTag are deprecated, use get_tag() and set_tag() - instead. - * added has_tag() - * tags is deprecated, use get_tags() and set_tags() instead. - -* FastqFile is now FastxFile to reflect that the latter permits - iteration over both fastq- and fasta-formatted files. - -* A Cython wrapper for htslib VCF/BCF reader/writer. The wrapper - provides a nearly complete Pythonic interface to VCF/BCF metadata - with reading and writing capability. However, the interface is still - incomplete and preliminary and lacks capability to mutate the - resulting data. - -Release 0.8.1 -============= - -* Pysam now wraps htslib and samtools versions 1.1. 
- -* Bugfixes, most notable: - * issue #43: uncompressed BAM output - * issue #42: skip tests requiring network if none available - * issue #19: multiple iterators can now be made to work on the same tabix file - * issue #24: All strings returned from/passed to the pysam API are now unicode in python 3 - * issue #5: type guessing for lists of integers fixed - -* API changes for consistency. The old API is still present, - but deprecated. - In particular: - - * Tabixfile -> TabixFile - * Fastafile -> FastaFile - * Fastqfile -> FastqFile - * Samfile -> AlignmentFile - * AlignedRead -> AlignedSegment - * qname -> query_name - * tid -> reference_id - * pos -> reference_start - * mapq -> mapping_quality - * rnext -> next_reference_id - * pnext -> next_reference_start - * cigar -> cigartuples - * cigarstring -> cigarstring - * tlen -> template_length - * seq -> query_sequence - * qual -> query_qualities, now returns array - * qqual -> query_alignment_qualities, now returns array - * tags -> tags - * alen -> reference_length, reference is always "alignment", so removed - * aend -> reference_end - * rlen -> query_length - * query -> query_alignment_sequence - * qstart -> query_alignment_start - * qend -> query_alignment_end - * qlen -> query_alignment_length - * mrnm -> next_reference_id - * mpos -> next_reference_start - * rname -> reference_id - * isize -> template_length - * blocks -> get_blocks() - * aligned_pairs -> get_aligned_pairs() - * inferred_length -> infer_query_length() - * positions -> get_reference_positions() - * overlap() -> get_overlap() - - * All strings are now passed to or received from the pysam API - as strings, no more bytes. - -Other changes: - * AlignmentFile.fetch(reopen) option is now multiple_iterators. The - default changed to not reopen a file unless requested by the user. 
- * FastaFile.getReferenceLength is now FastaFile.get_reference_length - -Backwards incompatible changes - -* Empty cigarstring now returns None (instead of '') -* Empty cigar now returns None (instead of []) -* When using the extension classes in cython modules, AlignedRead - needs to be substituted with AlignedSegment. -* fancy_str() has been removed -* qual, qqual now return arrays - -Release 0.8.0 -============= - -* Disabled features - * IteratorColumn.setMask() disabled as htslib does not implement - this functionality? - -* Not implemented yet: - * reading SAM files without header - -Tabix files between version 0.7.8 and 0.8.0 are -not compatible and need to be re-indexed. - -While version 0.7.8 and 0.8.0 should be mostly -compatible, there are some notable exceptions: - -* tabix iterators will fail if there are comments - in the middle or the end of a file. - -* tabix raises always ValueError for invalid intervals. - Previously, different types of errors were raised - (KeyError, IndexError, ValueError) depending on - the type of invalid intervals (missing chromosome, - out-of-range, malformatted interval). - - -Release 0.7.8 -============= - -* added AlignedRead.setTag method -* added AlignedRead.blocks -* unsetting CIGAR strings is now possible -* empty CIGAR string returns empty list -* added reopen flag to Samfile.fetch() -* various bugfixes - -Release 0.7.7 -============= - -* added Fastafile.references, .nreferences and .lengths -* tabix_iterator now uses kseq.h for python 2.7 - -Release 0.7.6 -============= - -* added inferred_length property -* issue 122: MACOSX getline missing, now it works? 
-* seq and qual can be set None -* added Fastqfile - -Release 0.7.5 -============= - -* switch to samtools 0.1.19 -* issue 122: MACOSX getline missing -* issue 130: clean up tempfiles -* various other bugfixes - -Release 0.7.4 -============= - -* further bugfixes to setup.py and package layout - -Release 0.7.3 -============= - -* further bugfixes to setup.py -* upgraded distribute_setup.py to 0.6.34 - -Release 0.7.2 -============= - -* bugfix in installer - failed when cython not present -* changed installation locations of shared libraries - -Release 0.7.1 -============= - -* bugfix: missing PP tag PG records in header -* added pre-built .c files to distribution - -Release 0.7 -=========== - -* switch to tabix 0.2.6 -* added cigarstring field -* python3 compatibility -* added B tag handling -* added check_sq and check_header options to Samfile.__init__ -* added lazy GTF parsing to tabix -* reworked support for VCF format parsing -* bugfixes - -Release 0.6 -=========== - -* switch to samtools 0.1.18 -* various bugfixes -* removed references to deprecated 'samtools pileup' functionality -* AlignedRead.tags now returns an empty list if there are no tags. -* added pnext, rnext and tlen - -Release 0.5 -=========== - -* switch to samtools 0.1.16 and tabix 0.2.5 -* improved tabix parsing, added vcf support -* re-organized code to permit linking against pysam -* various bugfixes -* added Samfile.positions and Samfile.overlap - -Release 0.4 -=========== - -* switch to samtools 0.1.12a and tabix 0.2.3 -* added snp and indel calling. -* switch from pyrex to cython -* changed handling of samtools stderr -* various bugfixes -* added Samfile.count and Samfile.mate -* deprecated AlignedRead.rname, added AlignedRead.tid - -Release 0.3 -=========== - -* switch to samtools 0.1.8 -* added support for tabix files -* numerous bugfixes including -* permit simultaneous iterators on the same file -* working access to remote files +.. 
include:: ../NEWS diff --git a/doc/requirements-rtd.txt b/doc/requirements-rtd.txt new file mode 100644 index 0000000..beb55b6 --- /dev/null +++ b/doc/requirements-rtd.txt @@ -0,0 +1,2 @@ +sphinx==7.2.5 +sphinx-rtd-theme==1.3.0 diff --git a/pyproject.toml b/pyproject.toml index 4106783..1f89f9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [project] name = "pysam" -description = "pysam - a python module for reading, manipulating and writing genomic data sets." +description = "Package for reading, manipulating, and writing genomic data" license = { text = "MIT License" } -version = "0.21.0" authors = [ { name = "Andreas Heger", email = "andreas.heger@gmail.com"} ] @@ -11,13 +10,32 @@ requires-python = ">=3.6" dynamic = [ "classifiers", "readme", + "version", ] -dependencies = [ - "cython", -] - +[project.urls] +"Documentation" = "https://pysam.readthedocs.io/" +"Release notes" = "https://pysam.readthedocs.io/en/stable/release.html" [build-system] -requires = ["setuptools>=59.0", "wheel", "Cython>=0.29.30,<3.0"] +requires = ["setuptools>=59.0", "Cython>=0.29.12,<4"] build-backend = "setuptools.build_meta:__legacy__" + +[tool.cibuildwheel] +before-all = "{project}/devtools/install-prerequisites.sh" +# Necessary until we build libhts.a out-of-tree from within build_temp +before-build = "make -C {project}/htslib distclean" + +test-requires = ["pytest"] +test-command = "REF_PATH=: pytest {project}/tests" + +[tool.tox] +legacy_tox_ini = """ + [tox] + envlist = py36, py311 + + [testenv] + deps = pytest + setenv = REF_PATH=: + commands = pytest tests +""" diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 75b5ee9..3071f37 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -757,7 +757,18 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): elif op == BAM_CHARD_CLIP: pass # advances neither - cdef char * md_tag = bam_aux2Z(md_tag_ptr) + cdef char *md_tag, md_buffer[2]; + cdef uint8_t 
md_typecode = md_tag_ptr[0] + if md_typecode == b'Z': + md_tag = bam_aux2Z(md_tag_ptr) + elif md_typecode == b'A': + # Work around HTSeq bug that writes 1-character strings as MD:A:v + md_buffer[0] = bam_aux2A(md_tag_ptr) + md_buffer[1] = b'\0' + md_tag = md_buffer + else: + raise TypeError('Tagged field MD:{}: does not have expected type MD:Z'.format(chr(md_typecode))) + cdef int md_idx = 0 cdef char c s_idx = 0 @@ -1083,7 +1094,10 @@ cdef class AlignedSegment: _sam = force_bytes(sam) line.s = _sam - sam_parse1(&line, dest.header.ptr, dest._delegate) + cdef int ret + ret = sam_parse1(&line, dest.header.ptr, dest._delegate) + if ret < 0: + raise ValueError("parsing SAM record string failed (error code {})".format(ret)) return dest @@ -1845,12 +1859,16 @@ cdef class AlignedSegment: def get_reference_positions(self, full_length=False): """a list of reference positions that this read aligns to. - By default, this method only returns positions in the - reference that are within the alignment. If *full_length* is - set, None values will be included for any soft-clipped or - unaligned positions within the read. The returned list will - thus be of the same length as the read. + By default, this method returns the (0-based) positions on the + reference that are within the read's alignment, leaving gaps + corresponding to deletions and other reference skips. + When *full_length* is True, the returned list is the same length + as the read and additionally includes None values corresponding + to insertions or soft-clipping, i.e., to bases of the read that + are not aligned to a reference position. + (See also :meth:`get_aligned_pairs` which additionally returns + the corresponding positions along the read.) """ cdef uint32_t k, i, l, pos cdef int op @@ -1958,6 +1976,10 @@ cdef class AlignedSegment: def get_aligned_pairs(self, matches_only=False, with_seq=False): """a list of aligned read (query) and reference positions. 
+ Each item in the returned list is a tuple consisting of + the 0-based offset from the start of the read sequence + followed by the 0-based reference position. + For inserts, deletions, skipping either query or reference position may be None. @@ -1968,7 +1990,7 @@ cdef class AlignedSegment: ---------- matches_only : bool - If True, only matched bases are returned - no None on either + If True, only matched bases are returned --- no None on either side. with_seq : bool If True, return a third element in the tuple containing the diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 74637f8..28b395a 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -71,6 +71,10 @@ class AlignmentHeader: def is_valid_tid(self, tid: int) -> bool: ... def get_tid(self, reference: str) -> int: ... +# The iterator produced by AlignmentFile is currently itself, but this may +# change in future and code should not make assumptions about this type. +AlignmentFileIterator = AlignmentFile + class AlignmentFile(HTSFile): def __init__( self, @@ -172,8 +176,8 @@ class AlignmentFile(HTSFile): @property def nocoordinate(self) -> int: ... def get_index_statistics(self) -> List[IndexStats]: ... - def __iter__(self) -> Any: ... - def __next__(self) -> Any: ... + def __iter__(self) -> AlignmentFileIterator: ... + def __next__(self) -> AlignedSegment: ... def is_valid_tid(self, tid: int) -> bool: ... def get_tid(self, reference: str) -> int: ... def get_reference_name(self, tid: int) -> str: ... 
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index e37a411..97d4e6d 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -73,7 +73,8 @@ from cpython cimport array as c_array from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn -from pysam.libchtslib cimport HTSFile, hisremote +from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \ + HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL from io import StringIO @@ -1005,7 +1006,8 @@ cdef class AlignmentFile(HTSFile): if cfilename or cindexname: with nogil: - self.index = sam_index_load2(self.htsfile, cfilename, cindexname) + self.index = sam_index_load3(self.htsfile, cfilename, cindexname, + HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL) if not self.index and (cindexname or require_index): if errno: diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index 8c088af..8ecfe5f 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -3479,7 +3479,7 @@ cdef class VariantRecordSample(object): return bcf_format_get_alleles(self) @alleles.setter - def alleles(self, value: tuple): + def alleles(self, value): # Sets the genotype, supply a tuple of alleles to set. 
# The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord # The genotype is reset when an empty tuple, None or (None,) is supplied diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx index 0d88f8d..d66a3c6 100644 --- a/pysam/libcbgzf.pyx +++ b/pysam/libcbgzf.pyx @@ -10,6 +10,7 @@ import io from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdio cimport SEEK_SET from libc.stdlib cimport malloc, calloc, realloc, free from cpython.object cimport PyObject @@ -18,7 +19,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize from pysam.libcutils cimport force_bytes, encode_filename from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \ bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \ - bgzf_tell, bgzf_getline, kstring_t, SEEK_SET, BGZF + bgzf_tell, bgzf_getline, kstring_t, BGZF __all__ = ["BGZFile"] diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 30a1b76..56e7460 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -273,8 +273,6 @@ cdef extern from "htslib/bgzf.h" nogil: # Write the data in the buffer to the file. int bgzf_flush(BGZF *fp) - int SEEK_SET - # Return a virtual file pointer to the current location in the file. # No interpretation of the value should be made, other than a subsequent # call to bgzf_seek can be used to position the file at the same point. 
@@ -285,7 +283,7 @@ cdef extern from "htslib/bgzf.h" nogil: # # @param fp BGZF file handler # @param pos virtual file offset returned by bgzf_tell() - # @param whence must be SEEK_SET + # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd) # @return 0 on success and -1 on error # / int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) @@ -362,7 +360,7 @@ cdef extern from "htslib/bgzf.h" nogil: # # @param fp BGZF file handler; must be opened for reading # @param uoffset file offset in the uncompressed data - # @param where SEEK_SET supported atm + # @param where SEEK_SET (cimported from libc.stdio) supported atm # # Returns 0 on success and -1 on error. int bgzf_useek(BGZF *fp, long uoffset, int where) @@ -688,6 +686,17 @@ cdef extern from "htslib/hts.h" nogil: # @return The index, or NULL if an error occurred. hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) + #### Load a specific index file + # @param fn Input BAM/BCF/etc filename + # @param fnidx The input index filename + # @param fmt One of the HTS_FMT_* index formats + # @param flags Flags to alter behaviour (see description) + # @return The index, or NULL if an error occurred. + hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) + + int HTS_IDX_SAVE_REMOTE + int HTS_IDX_SILENT_FAIL + uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) @@ -1092,6 +1101,14 @@ cdef extern from "htslib/sam.h" nogil: # @return The index, or NULL if an error occurred. hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) + # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file + # @param fp File handle of the data file whose index is being opened + # @param fn BAM/CRAM/etc data file filename + # @param fnidx Index filename, or NULL to search alongside @a fn + # @param flags Flags to alter behaviour + # @return The index, or NULL if an error occurred. 
+ hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) + # Generate and save an index file # @param fn Input BAM/etc filename, to which .csi/etc will be added # @param min_shift Positive to generate CSI, or 0 to generate BAI @@ -1466,6 +1483,7 @@ cdef extern from "htslib/tbx.h" nogil: tbx_t * tbx_index_load(char *fn) tbx_t *tbx_index_load2(const char *fn, const char *fnidx) + tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags) # free the array but not the values char **tbx_seqnames(tbx_t *tbx, int *n) @@ -2088,6 +2106,7 @@ cdef extern from "htslib/vcf.h" nogil: #************************************************************************ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) + hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags) int bcf_index_build(const char *fn, int min_shift) int bcf_index_build2(const char *fn, const char *fnidx, int min_shift) diff --git a/pysam/libchtslib.pyi b/pysam/libchtslib.pyi index 925828b..fcd7935 100644 --- a/pysam/libchtslib.pyi +++ b/pysam/libchtslib.pyi @@ -96,7 +96,7 @@ class HTSFile: @property def is_bcf(self) -> bool: ... def reset(self) -> None: ... - def seek(self, offset: int) -> int: ... + def seek(self, offset: int, whence: int = ...) -> int: ... def tell(self) -> int: ... def add_hts_options(self, format_options: Optional[List[str]] = ...) -> None: ... 
def parse_region( diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 760d268..3cb7b7a 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -14,7 +14,7 @@ from libc.stdint cimport INT32_MAX from cpython cimport PyBytes_FromStringAndSize from pysam.libchtslib cimport * from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len -from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io ######################################################################## @@ -35,11 +35,6 @@ from warnings import warn __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile'] -# defines imported from samtools -DEF SEEK_SET = 0 -DEF SEEK_CUR = 1 -DEF SEEK_END = 2 - # maximum genomic coordinace cdef int MAX_POS = (1 << 31) - 1 @@ -108,7 +103,7 @@ cdef class HFile(object): self.fp = NULL if hclose(fp) != 0: - raise IOError(herrno(self.fp), 'failed to close HFile', self.name) + raise IOError(errno, 'failed to close HFile', self.name) def fileno(self): if self.fp == NULL: @@ -246,11 +241,11 @@ cdef class HFile(object): def readlines(self): return list(self) - def seek(self, Py_ssize_t offset, int whence=SEEK_SET): + def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET): if self.fp == NULL: raise IOError('operation on closed HFile') - cdef Py_ssize_t off = hseek(self.fp, offset, whence) + cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence)) if off < 0: raise IOError(herrno(self.fp), 'seek failed on HFile', self.name) @@ -479,19 +474,21 @@ cdef class HTSFile(object): """ return self.seek(self.start_offset) - def seek(self, uint64_t offset): + def seek(self, uint64_t offset, int whence=io.SEEK_SET): """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`.""" if not self.is_open: raise ValueError('I/O operation on closed file') if self.is_stream: raise IOError('seek not available in streams') + 
whence = libc_whence_from_io(whence) + cdef int64_t ret if self.htsfile.format.compression == bgzf: with nogil: - ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) + ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence) elif self.htsfile.format.compression == no_compression: - ret = 0 if (hseek(self.htsfile.fp.hfile, offset, SEEK_SET) >= 0) else -1 + ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1 else: raise NotImplementedError("seek not implemented in files compressed by method {}".format( self.htsfile.format.compression)) diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd index de7f115..1bce057 100644 --- a/pysam/libcutils.pxd +++ b/pysam/libcutils.pxd @@ -7,6 +7,8 @@ from cpython cimport array as c_array cpdef parse_region(contig=*, start=*, stop=*, region=*, reference=*, end=*) +cdef int libc_whence_from_io(int whence) + ######################################################################### # Utility functions for quality string conversions diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index 246c835..cb08ef2 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -18,6 +18,7 @@ from libc.stdint cimport INT32_MAX, int32_t from libc.stdio cimport fprintf, stderr, fflush from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY +from posix.unistd cimport SEEK_SET, SEEK_CUR, SEEK_END from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \ samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn @@ -261,6 +262,16 @@ cpdef parse_region(contig=None, return contig, rstart, rstop +cdef int libc_whence_from_io(int whence): + # io.SEEK_SET/_CUR/_END are by definition 0/1/2 but C/POSIX's equivalents + # have unspecified values. So we must translate, but checking for 0/1/2 + # rather than io.SEEK_SET/etc suffices. 
+ if whence == 0: return SEEK_SET + if whence == 1: return SEEK_CUR + if whence == 2: return SEEK_END + return whence # Otherwise likely invalid, but let HTSlib or OS report it + + def _pysam_dispatch(collection, method, args=None, diff --git a/pysam/version.h b/pysam/version.h index 6d353c5..645557b 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.17 (pysam)" -#define BCFTOOLS_VERSION "1.17 (pysam)" -#define HTS_VERSION_TEXT "1.17 (pysam)" +#define SAMTOOLS_VERSION "1.18 (pysam)" +#define BCFTOOLS_VERSION "1.18 (pysam)" +#define HTS_VERSION_TEXT "1.18 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index 78b3ffd..62a9f31 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information -__version__ = "0.21.0" +__version__ = "0.22.0" -__samtools_version__ = "1.17" -__bcftools_version__ = "1.17" -__htslib_version__ = "1.17" +__samtools_version__ = "1.18" +__bcftools_version__ = "1.18" +__htslib_version__ = "1.18" diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..420c17e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +Cython>=0.29.12,<4 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f937d1c..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -cython>=0.29.12 diff --git a/samtools/README b/samtools/README index 60b37ac..8f4f236 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.17 # Within the unpacked release directory + cd .../samtools-1.18 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. 
Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.17 # Within the unpacked release directory + cd .../samtools-1.18 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.17 # Within the unpacked release directory + cd .../samtools-1.18 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. In that case you can use: - cd .../samtools-1.17 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.17 + cd .../samtools-1.18 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.18 make all all-htslib It is possible to override the built-in search path using the HTS_PATH diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c index 91fc858..72f39bd 100644 --- a/samtools/bam_ampliconclip.c +++ b/samtools/bam_ampliconclip.c @@ -1,7 +1,7 @@ /* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads from the 5' end. - Copyright (C) 2020-2022 Genome Research Ltd. + Copyright (C) 2020-2023 Genome Research Ltd. 
Authors: Andrew Whitwham Rob Davies @@ -59,6 +59,7 @@ typedef struct { int oa_tag; int del_tag; int tol; + int unmap_len; char *arg_list; char *stats_file; char *rejects_file; @@ -638,6 +639,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, long filtered = 0, written = 0, failed = 0; kstring_t str = KS_INITIALIZE; kstring_t oat = KS_INITIALIZE; + kstring_t seq = KS_INITIALIZE; bed_entry_list_t *sites; FILE *stats_fp = stderr; khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); @@ -829,16 +831,46 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, } } - if (param->fail_len >= 0 || param->filter_len >= 0) { - hts_pos_t aql = active_query_len(b); + if (param->fail_len >= 0 || param->filter_len >= 0 || param->unmap_len >= 0) { + hts_pos_t aql = active_query_len(b); - if (param->fail_len >= 0 && aql <= param->fail_len) { - b->core.flag |= BAM_FQCFAIL; - } + if (param->fail_len >= 0 && aql <= param->fail_len) { + b->core.flag |= BAM_FQCFAIL; + } + + if (param->filter_len >= 0 && aql <= param->filter_len) { + filter = 1; + } + + if (param->unmap_len >= 0 && aql <= param->unmap_len) { + + if (ks_resize(&seq, b->core.l_qseq) < 0) { + fprintf(stderr, "[ampliconclip] error: allocate memory for sequence %s\n", bam_get_seq(b)); + goto fail; + } + + ks_clear(&seq); + char *sb = ks_str(&seq); + uint8_t *sequence = bam_get_seq(b); + int i; - if (param->filter_len >= 0 && aql <= param->filter_len) { - filter = 1; - } + for (i = 0; i < b->core.l_qseq ; ++i) { + *sb++ = seq_nt16_str[bam_seqi(sequence, i)]; + } + + if (bam_set1(b_tmp, b->core.l_qname - b->core.l_extranul - 1, bam_get_qname(b), + (b->core.flag | BAM_FUNMAP), b->core.tid, b->core.pos, 0, + 0, NULL, b->core.mtid, b->core.mpos, b->core.isize, + b->core.l_qseq, seq.s, (const char *)bam_get_qual(b), + bam_get_l_aux(b)) < 0) { + fprintf(stderr, "[ampliconclip] error: could not unmap read %s\n", bam_get_seq(b)); + goto fail; + } + + 
memcpy(bam_get_aux(b_tmp), bam_get_aux(b), bam_get_l_aux(b)); + b_tmp->l_data += bam_get_l_aux(b); + swap_bams(&b, &b_tmp); + } } if (b->core.flag & BAM_FQCFAIL) { @@ -913,6 +945,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, fail: destroy_bed_hash(bed_hash); ks_free(&oat); + ks_free(&seq); sam_hdr_destroy(header); bam_destroy1(b); bam_destroy1(b_tmp); @@ -935,6 +968,7 @@ static void usage(void) { fprintf(stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n"); fprintf(stderr, " --filter-len INT do not output reads INT size or shorter.\n"); fprintf(stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n"); + fprintf(stderr, " --unmap-len INT unmap reads INT size or shorter, default 0.\n"); fprintf(stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n"); fprintf(stderr, " --rejects-file FILE file to write filtered reads.\n"); fprintf(stderr, " --original for clipped entries add an OA tag with original data.\n"); @@ -955,7 +989,7 @@ int amplicon_clip_main(int argc, char **argv) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL, *reject = NULL; clipping_type clipping = soft_clip; - cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL}; + cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, 0, NULL, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -973,6 +1007,7 @@ int amplicon_clip_main(int argc, char **argv) { {"original", no_argument, NULL, 1013}, {"keep-tag", no_argument, NULL, 1014}, {"tolerance", required_argument, NULL, 1015}, + {"unmap-len", required_argument, NULL, 1016}, {NULL, 0, NULL, 0} }; @@ -996,6 +1031,7 @@ int amplicon_clip_main(int argc, char **argv) { case 1013: param.oa_tag = 1; break; case 1014: param.del_tag = 0; break; case 1015: param.tol = atoi(optarg); break; + case 1016: param.unmap_len = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* 
else fall-through */ case '?': usage(); exit(1); @@ -1014,7 +1050,7 @@ int amplicon_clip_main(int argc, char **argv) { if (param.tol < 0) { fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d," - " reseting tolerance to default of 5.\n", param.tol); + " resetting tolerance to default of 5.\n", param.tol); param.tol = 5; } diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c index 4eb9c5a..0c36850 100644 --- a/samtools/bam_ampliconclip.c.pysam.c +++ b/samtools/bam_ampliconclip.c.pysam.c @@ -3,7 +3,7 @@ /* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads from the 5' end. - Copyright (C) 2020-2022 Genome Research Ltd. + Copyright (C) 2020-2023 Genome Research Ltd. Authors: Andrew Whitwham Rob Davies @@ -61,6 +61,7 @@ typedef struct { int oa_tag; int del_tag; int tol; + int unmap_len; char *arg_list; char *stats_file; char *rejects_file; @@ -640,6 +641,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, long filtered = 0, written = 0, failed = 0; kstring_t str = KS_INITIALIZE; kstring_t oat = KS_INITIALIZE; + kstring_t seq = KS_INITIALIZE; bed_entry_list_t *sites; FILE *stats_fp = samtools_stderr; khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); @@ -831,16 +833,46 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, } } - if (param->fail_len >= 0 || param->filter_len >= 0) { - hts_pos_t aql = active_query_len(b); + if (param->fail_len >= 0 || param->filter_len >= 0 || param->unmap_len >= 0) { + hts_pos_t aql = active_query_len(b); - if (param->fail_len >= 0 && aql <= param->fail_len) { - b->core.flag |= BAM_FQCFAIL; - } + if (param->fail_len >= 0 && aql <= param->fail_len) { + b->core.flag |= BAM_FQCFAIL; + } + + if (param->filter_len >= 0 && aql <= param->filter_len) { + filter = 1; + } + + if (param->unmap_len >= 0 && aql <= param->unmap_len) { + + if (ks_resize(&seq, b->core.l_qseq) < 0) { + fprintf(samtools_stderr, 
"[ampliconclip] error: allocate memory for sequence %s\n", bam_get_seq(b)); + goto fail; + } + + ks_clear(&seq); + char *sb = ks_str(&seq); + uint8_t *sequence = bam_get_seq(b); + int i; - if (param->filter_len >= 0 && aql <= param->filter_len) { - filter = 1; - } + for (i = 0; i < b->core.l_qseq ; ++i) { + *sb++ = seq_nt16_str[bam_seqi(sequence, i)]; + } + + if (bam_set1(b_tmp, b->core.l_qname - b->core.l_extranul - 1, bam_get_qname(b), + (b->core.flag | BAM_FUNMAP), b->core.tid, b->core.pos, 0, + 0, NULL, b->core.mtid, b->core.mpos, b->core.isize, + b->core.l_qseq, seq.s, (const char *)bam_get_qual(b), + bam_get_l_aux(b)) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: could not unmap read %s\n", bam_get_seq(b)); + goto fail; + } + + memcpy(bam_get_aux(b_tmp), bam_get_aux(b), bam_get_l_aux(b)); + b_tmp->l_data += bam_get_l_aux(b); + swap_bams(&b, &b_tmp); + } } if (b->core.flag & BAM_FQCFAIL) { @@ -915,6 +947,7 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, fail: destroy_bed_hash(bed_hash); ks_free(&oat); + ks_free(&seq); sam_hdr_destroy(header); bam_destroy1(b); bam_destroy1(b_tmp); @@ -937,6 +970,7 @@ static void usage(void) { fprintf(samtools_stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n"); fprintf(samtools_stderr, " --filter-len INT do not output reads INT size or shorter.\n"); fprintf(samtools_stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n"); + fprintf(samtools_stderr, " --unmap-len INT unmap reads INT size or shorter, default 0.\n"); fprintf(samtools_stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n"); fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n"); fprintf(samtools_stderr, " --original for clipped entries add an OA tag with original data.\n"); @@ -957,7 +991,7 @@ int amplicon_clip_main(int argc, char **argv) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL, *reject = NULL; clipping_type clipping = 
soft_clip; - cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL}; + cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, 0, NULL, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -975,6 +1009,7 @@ int amplicon_clip_main(int argc, char **argv) { {"original", no_argument, NULL, 1013}, {"keep-tag", no_argument, NULL, 1014}, {"tolerance", required_argument, NULL, 1015}, + {"unmap-len", required_argument, NULL, 1016}, {NULL, 0, NULL, 0} }; @@ -998,6 +1033,7 @@ int amplicon_clip_main(int argc, char **argv) { case 1013: param.oa_tag = 1; break; case 1014: param.del_tag = 0; break; case 1015: param.tol = atoi(optarg); break; + case 1016: param.unmap_len = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage(); samtools_exit(1); @@ -1016,7 +1052,7 @@ int amplicon_clip_main(int argc, char **argv) { if (param.tol < 0) { fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d," - " reseting tolerance to default of 5.\n", param.tol); + " resetting tolerance to default of 5.\n", param.tol); param.tol = 5; } diff --git a/samtools/bam_consensus.c b/samtools/bam_consensus.c index 4cdaf3f..3cbb24f 100644 --- a/samtools/bam_consensus.c +++ b/samtools/bam_consensus.c @@ -2043,20 +2043,30 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } if (opts->all_bases) { - if (tid != opts->last_tid && opts->last_tid >= 0) { - hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); - if (opts->iter) - len = MIN(opts->iter->end, len); - if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos, - len) < 0) - return -1; - if (tid >= 0) { - if (empty_pileup2(opts, opts->h, tid, - opts->iter ? 
opts->iter->beg : 0, - pos-1) < 0) + if (tid != opts->last_tid && opts->last_tid >= -1) { + if (opts->last_tid >= 0) { + // remainder of previous ref + hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); + if (opts->iter) + len = MIN(opts->iter->end, len); + if (empty_pileup2(opts, opts->h, opts->last_tid, + opts->last_pos, len) < 0) + return -1; + } + + opts->last_pos = opts->iter ? opts->iter->beg : 0; + } + + // Any refs between last_tid and tid + if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) { + while (++opts->last_tid < tid) { + hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); + if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0) return -1; } } + + // Any gaps in this ref (same tid) or at start of this new tid if (opts->last_pos >= 0 && pos > opts->last_pos+1) { if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos, pos-1) < 0) @@ -2167,9 +2177,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return 0; } + next_ref: if (tid != opts->last_tid) { if (opts->last_tid != -1) { if (opts->all_bases) { + // Fill in remainder of previous reference int i, N; if (opts->iter) { opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1); @@ -2197,9 +2209,13 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } seq->l = 0; qual->l = 0; + + if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) { + opts->last_pos = 0; + goto next_ref; + } + opts->last_tid = tid; -// if (opts->all_bases) -// opts->last_pos = 0; if (opts->iter) opts->last_pos = opts->iter->beg; else @@ -2710,6 +2726,13 @@ int main_consensus(int argc, char **argv) { if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0) goto err; } + while (!opts.iter && opts.all_bases > 1 && + ++opts.last_tid < opts.h->n_targets) { + int len = sam_hdr_tid2len(opts.h, opts.last_tid); + if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0) + goto err; + } + } else { if (pileup_loop(opts.fp, opts.h, readaln2, 
opts.mode != MODE_SIMPLE ? nm_init : NULL, @@ -2717,6 +2740,8 @@ int main_consensus(int argc, char **argv) { opts.mode != MODE_SIMPLE ? nm_free : NULL, &opts) < 0) goto err; + + next_ref_q: if (opts.all_bases) { // fill out terminator int tid = opts.iter ? opts.iter->tid : opts.last_tid; @@ -2744,6 +2769,13 @@ int main_consensus(int argc, char **argv) { dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid), opts.ks_ins_seq.s, opts.ks_ins_seq.l, opts.ks_ins_qual.s, opts.ks_ins_qual.l); + + if (!opts.iter && opts.all_bases > 1 && + ++opts.last_tid < opts.h->n_targets) { + opts.last_pos = 0; + opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0; + goto next_ref_q; + } // if (consensus_loop(&opts) < 0) { // print_error_errno("consensus", "Failed"); // goto err; diff --git a/samtools/bam_consensus.c.pysam.c b/samtools/bam_consensus.c.pysam.c index 70f47ba..b090a9a 100644 --- a/samtools/bam_consensus.c.pysam.c +++ b/samtools/bam_consensus.c.pysam.c @@ -2045,20 +2045,30 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } if (opts->all_bases) { - if (tid != opts->last_tid && opts->last_tid >= 0) { - hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); - if (opts->iter) - len = MIN(opts->iter->end, len); - if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos, - len) < 0) - return -1; - if (tid >= 0) { - if (empty_pileup2(opts, opts->h, tid, - opts->iter ? opts->iter->beg : 0, - pos-1) < 0) + if (tid != opts->last_tid && opts->last_tid >= -1) { + if (opts->last_tid >= 0) { + // remainder of previous ref + hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); + if (opts->iter) + len = MIN(opts->iter->end, len); + if (empty_pileup2(opts, opts->h, opts->last_tid, + opts->last_pos, len) < 0) + return -1; + } + + opts->last_pos = opts->iter ? 
opts->iter->beg : 0; + } + + // Any refs between last_tid and tid + if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) { + while (++opts->last_tid < tid) { + hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); + if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0) return -1; } } + + // Any gaps in this ref (same tid) or at start of this new tid if (opts->last_pos >= 0 && pos > opts->last_pos+1) { if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos, pos-1) < 0) @@ -2169,9 +2179,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return 0; } + next_ref: if (tid != opts->last_tid) { if (opts->last_tid != -1) { if (opts->all_bases) { + // Fill in remainder of previous reference int i, N; if (opts->iter) { opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1); @@ -2199,9 +2211,13 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } seq->l = 0; qual->l = 0; + + if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) { + opts->last_pos = 0; + goto next_ref; + } + opts->last_tid = tid; -// if (opts->all_bases) -// opts->last_pos = 0; if (opts->iter) opts->last_pos = opts->iter->beg; else @@ -2712,6 +2728,13 @@ int main_consensus(int argc, char **argv) { if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0) goto err; } + while (!opts.iter && opts.all_bases > 1 && + ++opts.last_tid < opts.h->n_targets) { + int len = sam_hdr_tid2len(opts.h, opts.last_tid); + if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0) + goto err; + } + } else { if (pileup_loop(opts.fp, opts.h, readaln2, opts.mode != MODE_SIMPLE ? nm_init : NULL, @@ -2719,6 +2742,8 @@ int main_consensus(int argc, char **argv) { opts.mode != MODE_SIMPLE ? nm_free : NULL, &opts) < 0) goto err; + + next_ref_q: if (opts.all_bases) { // fill out terminator int tid = opts.iter ? 
opts.iter->tid : opts.last_tid; @@ -2746,6 +2771,13 @@ int main_consensus(int argc, char **argv) { dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid), opts.ks_ins_seq.s, opts.ks_ins_seq.l, opts.ks_ins_qual.s, opts.ks_ins_qual.l); + + if (!opts.iter && opts.all_bases > 1 && + ++opts.last_tid < opts.h->n_targets) { + opts.last_pos = 0; + opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0; + goto next_ref_q; + } // if (consensus_loop(&opts) < 0) { // print_error_errno("consensus", "Failed"); // goto err; diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c index c17821d..e4701b1 100644 --- a/samtools/bam_fastq.c +++ b/samtools/bam_fastq.c @@ -1,6 +1,6 @@ /* bam_fastq.c -- FASTA and FASTQ file generation - Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd. + Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/sam.h" #include "htslib/klist.h" @@ -64,8 +65,14 @@ static void bam2fq_usage(FILE *to, const char *command) " -o FILE write reads designated READ1 or READ2 to FILE\n" " note: if a singleton file is specified with -s, only\n" " paired reads will be written to the -1 and -2 files.\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +" -d, --tag TAG[:VAL]\n" +" only include reads containing TAG, optionally with value VAL\n" +" -f, --require-flags INT\n" +" only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F, --excl[ude]-flags INT\n" +" only include reads with none of the FLAGs in INT present [0x900]\n" // F&x == 0 +" --rf, --incl[ude]-flags INT\n" +" only include reads with any of the FLAGs in INT present [0]\n" // !(F&x == 0) " -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) " -n don't 
append /1 and /2 to the read name\n" " -N always append /1 and /2 to the read name\n", @@ -132,7 +139,7 @@ typedef struct bam2fq_opts { char *fnr[3]; char *fn_input; // pointer to input filename in argv do not free bool has12, has12always, use_oq, copy_tags, illumina_tag; - int flag_on, flag_off, flag_alloff; + int flag_on, flag_off, flag_alloff, flag_anyon; sam_global_args ga; fastfile filetype; int def_qual; @@ -142,6 +149,10 @@ typedef struct bam2fq_opts { char *index_format; char *extra_tags; char compression_level; + const char *filter_tag; // -d opt + const char *filter_value_str; + int64_t filter_value_int; + float filter_value_flt; } bam2fq_opts_t; typedef struct bam2fq_state { @@ -152,7 +163,7 @@ typedef struct bam2fq_state { samFile *hstdout; sam_hdr_t *h; bool has12, use_oq, copy_tags, illumina_tag; - int flag_on, flag_off, flag_alloff; + int flag_on, flag_off, flag_alloff, flag_anyon; fastfile filetype; int def_qual; char *index_sequence; @@ -176,6 +187,9 @@ static void free_opts(bam2fq_opts_t *opts) free(opts); } +// Make mnemonic distinct values for longoption-only options +#define LONGOPT(c) ((c) + 128) + // return true if valid static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) { @@ -193,12 +207,19 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) opts->extra_tags = NULL; opts->compression_level = 1; opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY; - int flag_off_set = 0; int c; sam_global_args_init(&opts->ga); static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), + {"require-flags", required_argument, NULL, 'f'}, + {"excl-flags", required_argument, NULL, 'F'}, + {"exclude-flags", required_argument, NULL, 'F'}, + // following the same convention as view: g exists as a longoption_only + // argument, accessible from the command line as --rf/--incl[ude]-flags + {"rf", required_argument, NULL, LONGOPT('g')}, + {"incl-flags", required_argument, NULL, LONGOPT('g')}, + 
{"include-flags", required_argument, NULL, LONGOPT('g')}, {"i1", required_argument, NULL, 1}, {"I1", required_argument, NULL, 1}, {"i2", required_argument, NULL, 2}, @@ -208,9 +229,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) {"index-format", required_argument, NULL, 3}, {"barcode-tag", required_argument, NULL, 'b'}, {"quality-tag", required_argument, NULL, 'q'}, + {"tag", required_argument, NULL, 'd'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", + while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:", lopts, NULL)) > 0) { switch (c) { case 'b': opts->barcode_tag = optarg; break; @@ -223,14 +245,11 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case '2': opts->fnr[2] = optarg; break; case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break; case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; - case 'F': - if (!flag_off_set) { - flag_off_set = 1; - opts->flag_off = 0; - } - opts->flag_off |= strtol(optarg, 0, 0); - break; + // note that flag_off does not have |= because it has a default + // value of 0x900 which needs to be replaced by the optarg + case 'F': opts->flag_off = strtol(optarg, 0, 0); break; case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; + case LONGOPT('g'): opts->flag_anyon |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; case 'N': opts->has12always = true; break; case 'O': opts->use_oq = true; break; @@ -247,6 +266,22 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case 'T': opts->extra_tags = optarg; break; case 'v': opts->def_qual = atoi(optarg); break; + case 'd': + if (strlen(optarg) < 2 || + (strlen(optarg) > 2 && optarg[2] != ':')) { + print_error("fastq", + "Invalid \"tag:value\" option: \"%s\"", + optarg); + free_opts(opts); + return false; + } + + opts->filter_tag = optarg; + opts->filter_value_str = strlen(optarg) > 2 ? 
optarg+3 : NULL; + opts->filter_value_int = INT64_MAX; // fill out later + opts->filter_value_flt = FLT_MAX; + break; + case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); @@ -401,6 +436,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; state->flag_alloff = opts->flag_alloff; + state->flag_anyon = opts->flag_anyon; state->has12 = opts->has12; state->use_oq = opts->use_oq; state->illumina_tag = opts->illumina_tag; @@ -411,7 +447,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->hstdout = NULL; state->compression_level = opts->compression_level; - state->fp = sam_open(opts->fn_input, "r"); + state->fp = sam_open_format(opts->fn_input, "r", &opts->ga.in); if (state->fp == NULL) { print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); free(state); @@ -430,7 +466,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) } uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; - if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; + if (opts->use_oq || opts->extra_tags || opts->index_file[0]) + rf |= SAM_AUX; + if (opts->filter_tag) { + if (memcmp(opts->filter_tag, "NM", 2) == 0 || + memcmp(opts->filter_tag, "MD", 2) == 0) + rf |= SAM_AUX | SAM_SEQ; + else if (memcmp(opts->filter_tag, "RG", 2) == 0) + rf |= SAM_RGAUX; + else + rf |= SAM_AUX; + } if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); free(state); @@ -576,10 +622,59 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* return valid; } -static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) +static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state, + bam2fq_opts_t *opts) { + if (opts->filter_tag) { + uint8_t *s = bam_aux_get(b, opts->filter_tag); + if 
(!s) + return true; + + if (opts->filter_value_str) { + switch (*s) { + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + if (opts->filter_value_int == INT64_MAX) + // cache integer conversion for repeated use + opts->filter_value_int = + strtoll(opts->filter_value_str, NULL, 0); + if (opts->filter_value_int != bam_aux2i(s)) + return true; + break; + + case 'f': + if (opts->filter_value_flt == FLT_MAX) + opts->filter_value_flt = atof(opts->filter_value_str); + // Comparing floats is hard. + // Eg (double)0.1 - (double)0.1f is -1.5e-9. + // Given BAM binary encoding is float however, just keep it. + // This means rounding errors will (hopefully) always be the + // same and basic equality still works. + if (opts->filter_value_flt != (float)bam_aux2f(s)) + return true; + break; + + case 'A': + if (s[1] != *opts->filter_value_str) + return true; + break; + + case 'Z': case 'H': + if (strcmp((char *)s+1, opts->filter_value_str) != 0) + return true; + break; + + default: + // Anything unsupported fails the filter match too. + return true; + } + } + } + return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags || (b->core.flag&(state->flag_off)) != 0 + || (((b->core.flag&(state->flag_anyon)) == 0) && (state->flag_anyon != 0)) || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); } @@ -798,7 +893,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) } at_eof = res < 0; - if (!at_eof && filter_it_out(b[n], state)) + if (!at_eof && filter_it_out(b[n], state, opts)) continue; if (!at_eof) { ++n_reads; diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c index fbe65fb..cd8fa27 100644 --- a/samtools/bam_fastq.c.pysam.c +++ b/samtools/bam_fastq.c.pysam.c @@ -2,7 +2,7 @@ /* bam_fastq.c -- FASTA and FASTQ file generation - Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd. 
+ Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/sam.h" #include "htslib/klist.h" @@ -66,8 +67,14 @@ static void bam2fq_usage(FILE *to, const char *command) " -o FILE write reads designated READ1 or READ2 to FILE\n" " note: if a singleton file is specified with -s, only\n" " paired reads will be written to the -1 and -2 files.\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +" -d, --tag TAG[:VAL]\n" +" only include reads containing TAG, optionally with value VAL\n" +" -f, --require-flags INT\n" +" only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F, --excl[ude]-flags INT\n" +" only include reads with none of the FLAGs in INT present [0x900]\n" // F&x == 0 +" --rf, --incl[ude]-flags INT\n" +" only include reads with any of the FLAGs in INT present [0]\n" // !(F&x == 0) " -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) " -n don't append /1 and /2 to the read name\n" " -N always append /1 and /2 to the read name\n", @@ -134,7 +141,7 @@ typedef struct bam2fq_opts { char *fnr[3]; char *fn_input; // pointer to input filename in argv do not free bool has12, has12always, use_oq, copy_tags, illumina_tag; - int flag_on, flag_off, flag_alloff; + int flag_on, flag_off, flag_alloff, flag_anyon; sam_global_args ga; fastfile filetype; int def_qual; @@ -144,6 +151,10 @@ typedef struct bam2fq_opts { char *index_format; char *extra_tags; char compression_level; + const char *filter_tag; // -d opt + const char *filter_value_str; + int64_t filter_value_int; + float filter_value_flt; } bam2fq_opts_t; typedef struct bam2fq_state { @@ -154,7 +165,7 @@ typedef struct bam2fq_state { samFile *hstdout; sam_hdr_t 
*h; bool has12, use_oq, copy_tags, illumina_tag; - int flag_on, flag_off, flag_alloff; + int flag_on, flag_off, flag_alloff, flag_anyon; fastfile filetype; int def_qual; char *index_sequence; @@ -178,6 +189,9 @@ static void free_opts(bam2fq_opts_t *opts) free(opts); } +// Make mnemonic distinct values for longoption-only options +#define LONGOPT(c) ((c) + 128) + // return true if valid static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) { @@ -195,12 +209,19 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) opts->extra_tags = NULL; opts->compression_level = 1; opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY; - int flag_off_set = 0; int c; sam_global_args_init(&opts->ga); static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), + {"require-flags", required_argument, NULL, 'f'}, + {"excl-flags", required_argument, NULL, 'F'}, + {"exclude-flags", required_argument, NULL, 'F'}, + // following the same convention as view: g exists as a longoption_only + // argument, accessible from the command line as --rf/--incl[ude]-flags + {"rf", required_argument, NULL, LONGOPT('g')}, + {"incl-flags", required_argument, NULL, LONGOPT('g')}, + {"include-flags", required_argument, NULL, LONGOPT('g')}, {"i1", required_argument, NULL, 1}, {"I1", required_argument, NULL, 1}, {"i2", required_argument, NULL, 2}, @@ -210,9 +231,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) {"index-format", required_argument, NULL, 3}, {"barcode-tag", required_argument, NULL, 'b'}, {"quality-tag", required_argument, NULL, 'q'}, + {"tag", required_argument, NULL, 'd'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", + while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:", lopts, NULL)) > 0) { switch (c) { case 'b': opts->barcode_tag = optarg; break; @@ -225,14 +247,11 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** 
opts_out) case '2': opts->fnr[2] = optarg; break; case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break; case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; - case 'F': - if (!flag_off_set) { - flag_off_set = 1; - opts->flag_off = 0; - } - opts->flag_off |= strtol(optarg, 0, 0); - break; + // note that flag_off does not have |= because it has a default + // value of 0x900 which needs to be replaced by the optarg + case 'F': opts->flag_off = strtol(optarg, 0, 0); break; case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; + case LONGOPT('g'): opts->flag_anyon |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; case 'N': opts->has12always = true; break; case 'O': opts->use_oq = true; break; @@ -249,6 +268,22 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case 'T': opts->extra_tags = optarg; break; case 'v': opts->def_qual = atoi(optarg); break; + case 'd': + if (strlen(optarg) < 2 || + (strlen(optarg) > 2 && optarg[2] != ':')) { + print_error("fastq", + "Invalid \"tag:value\" option: \"%s\"", + optarg); + free_opts(opts); + return false; + } + + opts->filter_tag = optarg; + opts->filter_value_str = strlen(optarg) > 2 ? 
optarg+3 : NULL; + opts->filter_value_int = INT64_MAX; // fill out later + opts->filter_value_flt = FLT_MAX; + break; + case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); @@ -403,6 +438,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; state->flag_alloff = opts->flag_alloff; + state->flag_anyon = opts->flag_anyon; state->has12 = opts->has12; state->use_oq = opts->use_oq; state->illumina_tag = opts->illumina_tag; @@ -413,7 +449,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->hstdout = NULL; state->compression_level = opts->compression_level; - state->fp = sam_open(opts->fn_input, "r"); + state->fp = sam_open_format(opts->fn_input, "r", &opts->ga.in); if (state->fp == NULL) { print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); free(state); @@ -432,7 +468,17 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) } uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; - if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; + if (opts->use_oq || opts->extra_tags || opts->index_file[0]) + rf |= SAM_AUX; + if (opts->filter_tag) { + if (memcmp(opts->filter_tag, "NM", 2) == 0 || + memcmp(opts->filter_tag, "MD", 2) == 0) + rf |= SAM_AUX | SAM_SEQ; + else if (memcmp(opts->filter_tag, "RG", 2) == 0) + rf |= SAM_RGAUX; + else + rf |= SAM_AUX; + } if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); free(state); @@ -578,10 +624,59 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* return valid; } -static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) +static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state, + bam2fq_opts_t *opts) { + if (opts->filter_tag) { + uint8_t *s = bam_aux_get(b, 
opts->filter_tag); + if (!s) + return true; + + if (opts->filter_value_str) { + switch (*s) { + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + if (opts->filter_value_int == INT64_MAX) + // cache integer conversion for repeated use + opts->filter_value_int = + strtoll(opts->filter_value_str, NULL, 0); + if (opts->filter_value_int != bam_aux2i(s)) + return true; + break; + + case 'f': + if (opts->filter_value_flt == FLT_MAX) + opts->filter_value_flt = atof(opts->filter_value_str); + // Comparing floats is hard. + // Eg (double)0.1 - (double)0.1f is -1.5e-9. + // Given BAM binary encoding is float however, just keep it. + // This means rounding errors will (hopefully) always be the + // same and basic equality still works. + if (opts->filter_value_flt != (float)bam_aux2f(s)) + return true; + break; + + case 'A': + if (s[1] != *opts->filter_value_str) + return true; + break; + + case 'Z': case 'H': + if (strcmp((char *)s+1, opts->filter_value_str) != 0) + return true; + break; + + default: + // Anything unsupported fails the filter match too. + return true; + } + } + } + return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags || (b->core.flag&(state->flag_off)) != 0 + || (((b->core.flag&(state->flag_anyon)) == 0) && (state->flag_anyon != 0)) || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); } @@ -800,7 +895,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) } at_eof = res < 0; - if (!at_eof && filter_it_out(b[n], state)) + if (!at_eof && filter_it_out(b[n], state, opts)) continue; if (!at_eof) { ++n_reads; diff --git a/samtools/bam_import.c b/samtools/bam_import.c index 14ff0b0..079e04b 100644 --- a/samtools/bam_import.c +++ b/samtools/bam_import.c @@ -4,7 +4,7 @@ * samtools import a_1.fq a_2.fq * samtools import a_interleaved.fq * - * Copyright (C) 2020-2021 Genome Research Ltd. + * Copyright (C) 2020-2021, 2023 Genome Research Ltd. 
* * Author: James Bonfield */ @@ -93,6 +93,7 @@ typedef struct { char *rg; char *rg_line; char *order; + int order_str; int compress_level; htsThreadPool p; int name2; @@ -358,9 +359,23 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { } if (opts->order) { - if (bam_aux_update_int(b, opts->order, read_num++) < 0) { - ret = -1; - goto err; + if (opts->order_str) { + char buf[25]; + snprintf(buf, sizeof(buf), "%0*"PRIu64, + opts->order_str, read_num++); + if (bam_aux_update_str(b, opts->order, + strlen(buf), buf) < 0) { + ret = -1; + goto err; + } + } else { + if (bam_aux_update_int(b, opts->order, read_num++) < 0) { + ret = -1; + goto err; + } + if (read_num == UINT_MAX) + fprintf(stderr, "Warning: --order tag has overflowed." + " Consider using TAG:LENGTH instead\n"); } } @@ -421,6 +436,7 @@ int main_import(int argc, char *argv[]) { .rg = NULL, .rg_line = NULL, .order = NULL, + .order_str = 0, .compress_level = -1, .name2 = 0, }; @@ -470,7 +486,11 @@ int main_import(int argc, char *argv[]) { case 'N': opts.name2 = 1; break; case 9: opts.no_pg = 1; break; - case 3: opts.order = optarg; break; + case 3: + opts.order = optarg; + if (strlen(optarg) > 3 && optarg[2] == ':') + opts.order_str = atoi(optarg+3); + break; case 'h': return usage(stdout, EXIT_SUCCESS); case '?': return usage(stderr, EXIT_FAILURE); diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c index 842ff60..f16a781 100644 --- a/samtools/bam_import.c.pysam.c +++ b/samtools/bam_import.c.pysam.c @@ -6,7 +6,7 @@ * samtools import a_1.fq a_2.fq * samtools import a_interleaved.fq * - * Copyright (C) 2020-2021 Genome Research Ltd. + * Copyright (C) 2020-2021, 2023 Genome Research Ltd. 
* * Author: James Bonfield */ @@ -95,6 +95,7 @@ typedef struct { char *rg; char *rg_line; char *order; + int order_str; int compress_level; htsThreadPool p; int name2; @@ -360,9 +361,23 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { } if (opts->order) { - if (bam_aux_update_int(b, opts->order, read_num++) < 0) { - ret = -1; - goto err; + if (opts->order_str) { + char buf[25]; + snprintf(buf, sizeof(buf), "%0*"PRIu64, + opts->order_str, read_num++); + if (bam_aux_update_str(b, opts->order, + strlen(buf), buf) < 0) { + ret = -1; + goto err; + } + } else { + if (bam_aux_update_int(b, opts->order, read_num++) < 0) { + ret = -1; + goto err; + } + if (read_num == UINT_MAX) + fprintf(samtools_stderr, "Warning: --order tag has overflowed." + " Consider using TAG:LENGTH instead\n"); } } @@ -423,6 +438,7 @@ int main_import(int argc, char *argv[]) { .rg = NULL, .rg_line = NULL, .order = NULL, + .order_str = 0, .compress_level = -1, .name2 = 0, }; @@ -472,7 +488,11 @@ int main_import(int argc, char *argv[]) { case 'N': opts.name2 = 1; break; case 9: opts.no_pg = 1; break; - case 3: opts.order = optarg; break; + case 3: + opts.order = optarg; + if (strlen(optarg) > 3 && optarg[2] == ':') + opts.order_str = atoi(optarg+3); + break; case 'h': return usage(samtools_stdout, EXIT_SUCCESS); case '?': return usage(samtools_stderr, EXIT_FAILURE); diff --git a/samtools/bam_index.c b/samtools/bam_index.c index f7c3358..0803f3e 100644 --- a/samtools/bam_index.c +++ b/samtools/bam_index.c @@ -1,6 +1,6 @@ /* bam_index.c -- index and idxstats subcommands. - Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd. + Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023 Genome Research Ltd. Portions copyright (C) 2010 Broad Institute. Portions copyright (C) 2013 Peter Cock, The James Hutton Institute. 
@@ -47,12 +47,12 @@ static void index_usage(FILE *fp) "Usage: samtools index -M [-bc] [-m INT] ...\n" " or: samtools index [-bc] [-m INT] [out.index]\n" "Options:\n" -" -b Generate BAI-format index for BAM files [default]\n" -" -c Generate CSI-format index for BAM files\n" -" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n" -" -M Interpret all filename arguments as files to be indexed\n" -" -o FILE Write index to FILE [alternative to as an argument]\n" -" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); +" -b, --bai Generate BAI-format index for BAM files [default]\n" +" -c, --csi Generate CSI-format index for BAM files\n" +" -m, --min-shift INT Set minimum interval size for CSI indices to 2^INT [%d]\n" +" -M Interpret all filename arguments as files to be indexed\n" +" -o, --output FILE Write index to FILE [alternative to in args]\n" +" -@, --threads INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); } // Returns 1 if the file does not exist or can be positively @@ -80,7 +80,16 @@ int bam_index(int argc, char *argv[]) int n_files, c, i, ret; const char *fn_idx = NULL; - while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0) + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'), + {"output", required_argument, NULL, 'o'}, + {"bai", no_argument, NULL, 'b'}, + {"csi", no_argument, NULL, 'c'}, + {"min-shift", required_argument, NULL, 'm'}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, "bcm:Mo:@:", lopts, NULL)) >= 0) switch (c) { case 'b': csi = 0; break; case 'c': csi = 1; break; diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c index 6627cfa..3093c01 100644 --- a/samtools/bam_index.c.pysam.c +++ b/samtools/bam_index.c.pysam.c @@ -2,7 +2,7 @@ /* bam_index.c -- index and idxstats subcommands. - Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd. + Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023 Genome Research Ltd. 
Portions copyright (C) 2010 Broad Institute. Portions copyright (C) 2013 Peter Cock, The James Hutton Institute. @@ -49,12 +49,12 @@ static void index_usage(FILE *fp) "Usage: samtools index -M [-bc] [-m INT] ...\n" " or: samtools index [-bc] [-m INT] [out.index]\n" "Options:\n" -" -b Generate BAI-format index for BAM files [default]\n" -" -c Generate CSI-format index for BAM files\n" -" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n" -" -M Interpret all filename arguments as files to be indexed\n" -" -o FILE Write index to FILE [alternative to as an argument]\n" -" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); +" -b, --bai Generate BAI-format index for BAM files [default]\n" +" -c, --csi Generate CSI-format index for BAM files\n" +" -m, --min-shift INT Set minimum interval size for CSI indices to 2^INT [%d]\n" +" -M Interpret all filename arguments as files to be indexed\n" +" -o, --output FILE Write index to FILE [alternative to in args]\n" +" -@, --threads INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); } // Returns 1 if the file does not exist or can be positively @@ -82,7 +82,16 @@ int bam_index(int argc, char *argv[]) int n_files, c, i, ret; const char *fn_idx = NULL; - while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0) + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'), + {"output", required_argument, NULL, 'o'}, + {"bai", no_argument, NULL, 'b'}, + {"csi", no_argument, NULL, 'c'}, + {"min-shift", required_argument, NULL, 'm'}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, "bcm:Mo:@:", lopts, NULL)) >= 0) switch (c) { case 'b': csi = 0; break; case 'c': csi = 1; break; diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index fc333c4..677a47f 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -76,6 +76,7 @@ typedef struct { regex_t *bc_rgx; int read_groups; int json; + int dc; } md_param_t; typedef struct { @@ -96,6 +97,7 
@@ typedef struct read_queue_s { bam1_t *b; struct read_queue_s *duplicate; struct read_queue_s *original; + int dc; hts_pos_t pos; int dup_checked; int read_group; @@ -1616,6 +1618,7 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->original = NULL; in_read->dup_checked = 0; in_read->read_group = 0; + in_read->dc = 1; if (param->read_groups) { uint8_t *data; @@ -1703,6 +1706,7 @@ static int bam_mark_duplicates(md_param_t *param) { } bp->p = in_read; + bp->p->dc += 1; if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; @@ -1765,6 +1769,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (new_score + tie_add > old_score) { // swap reads dup = bp->p->b; + in_read->dc += bp->p->dc; if (param->check_chain) { @@ -1805,6 +1810,7 @@ static int bam_mark_duplicates(md_param_t *param) { } dup = in_read->b; + bp->p->dc += 1; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings)) @@ -1846,6 +1852,8 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->original = bp->p; } + bp->p->dc += 1; + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; @@ -1860,6 +1868,7 @@ static int bam_mark_duplicates(md_param_t *param) { // to the single hash and mark the other as duplicate if (new_score > old_score) { // swap reads dup = bp->p->b; + in_read->dc += bp->p->dc; if (param->check_chain) { in_read->duplicate = bp->p; @@ -1877,6 +1886,7 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->original = bp->p; } + bp->p->dc += 1; dup = in_read->b; } @@ -1914,6 +1924,9 @@ static int bam_mark_duplicates(md_param_t *param) { } if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) { + bam_aux_update_int(in_read->b, "dc", in_read->dc); + } if (param->supp) { if (tmp_file_write(&temp, 
in_read->b)) { print_error("markdup", "error, writing temp output failed.\n"); @@ -1977,12 +1990,20 @@ static int bam_mark_duplicates(md_param_t *param) { } if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) { + bam_aux_update_int(in_read->b, "dc", in_read->dc); + } + if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { print_error("markdup", "error, writing temp output failed on final write.\n"); goto fail; } } else { + if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) { + bam_aux_update_int(in_read->b, "dc", in_read->dc); + } + if (sam_write1(param->out, header, in_read->b) < 0) { print_error("markdup", "error, writing output failed on final write.\n"); goto fail; @@ -2044,6 +2065,10 @@ static int bam_mark_duplicates(md_param_t *param) { } if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { + if (param->dc && (b->core.flag & BAM_FDUP)) { + uint8_t* data = bam_aux_get(b, "dc"); + if(data) bam_aux_del(b, data); + } if (sam_write1(param->out, header, b) < 0) { print_error("markdup", "error, writing final output failed.\n"); goto fail; @@ -2179,6 +2204,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(idx_fn); free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); @@ -2205,6 +2231,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(idx_fn); free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); @@ -2242,6 +2269,7 @@ static int markdup_usage(void) { fprintf(stderr, " --use-read-groups Use the read group tags in duplicate matching.\n"); fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); + fprintf(stderr, " --duplicate-count Record the original primary read duplication count(include itself) in a \'dc\' tag.\n"); sam_global_opt_help(stderr, "-.O..@.."); @@ -2263,7 +2291,7 @@ int bam_markdup(int argc, char **argv) { char *regex = NULL, *bc_regex = NULL; char *regex_order = "txy"; md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0}; + 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2278,6 +2306,7 @@ int bam_markdup(int argc, char **argv) { {"barcode-rgx", required_argument, NULL, 1008}, {"use-read-groups", no_argument, NULL, 1009}, {"json", no_argument, NULL, 1010}, + {"duplicate-count", no_argument, NULL, 1011}, {NULL, 0, NULL, 0} }; @@ -2314,6 +2343,7 @@ int bam_markdup(int argc, char **argv) { case 1008: bc_name = 1, bc_regex = optarg; break; case 1009: param.read_groups = 1; break; case 1010: param.json = 1; param.do_stats = 1; break; + case 1011: param.dc = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index 3e3b0b5..e8fea3d 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -78,6 +78,7 @@ typedef struct { regex_t *bc_rgx; int read_groups; int json; + int dc; } md_param_t; typedef struct { @@ -98,6 +99,7 @@ typedef struct read_queue_s { bam1_t *b; struct read_queue_s *duplicate; struct read_queue_s *original; + int dc; hts_pos_t pos; int dup_checked; int read_group; @@ -1618,6 +1620,7 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->original = NULL; in_read->dup_checked = 0; in_read->read_group = 0; + in_read->dc = 1; if (param->read_groups) { uint8_t *data; @@ -1705,6 +1708,7 @@ static int 
bam_mark_duplicates(md_param_t *param) { } bp->p = in_read; + bp->p->dc += 1; if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; @@ -1767,6 +1771,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (new_score + tie_add > old_score) { // swap reads dup = bp->p->b; + in_read->dc += bp->p->dc; if (param->check_chain) { @@ -1807,6 +1812,7 @@ static int bam_mark_duplicates(md_param_t *param) { } dup = in_read->b; + bp->p->dc += 1; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings)) @@ -1848,6 +1854,8 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->original = bp->p; } + bp->p->dc += 1; + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; @@ -1862,6 +1870,7 @@ static int bam_mark_duplicates(md_param_t *param) { // to the single hash and mark the other as duplicate if (new_score > old_score) { // swap reads dup = bp->p->b; + in_read->dc += bp->p->dc; if (param->check_chain) { in_read->duplicate = bp->p; @@ -1879,6 +1888,7 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->original = bp->p; } + bp->p->dc += 1; dup = in_read->b; } @@ -1916,6 +1926,9 @@ static int bam_mark_duplicates(md_param_t *param) { } if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) { + bam_aux_update_int(in_read->b, "dc", in_read->dc); + } if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { print_error("markdup", "error, writing temp output failed.\n"); @@ -1979,12 +1992,20 @@ static int bam_mark_duplicates(md_param_t *param) { } if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) { + bam_aux_update_int(in_read->b, "dc", in_read->dc); + } + if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { 
print_error("markdup", "error, writing temp output failed on final write.\n"); goto fail; } } else { + if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) { + bam_aux_update_int(in_read->b, "dc", in_read->dc); + } + if (sam_write1(param->out, header, in_read->b) < 0) { print_error("markdup", "error, writing output failed on final write.\n"); goto fail; @@ -2046,6 +2067,10 @@ static int bam_mark_duplicates(md_param_t *param) { } if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { + if (param->dc && (b->core.flag & BAM_FDUP)) { + uint8_t* data = bam_aux_get(b, "dc"); + if(data) bam_aux_del(b, data); + } if (sam_write1(param->out, header, b) < 0) { print_error("markdup", "error, writing final output failed.\n"); goto fail; @@ -2181,6 +2206,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(idx_fn); free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); @@ -2207,6 +2233,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(idx_fn); free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); @@ -2244,6 +2271,7 @@ static int markdup_usage(void) { fprintf(samtools_stderr, " --use-read-groups Use the read group tags in duplicate matching.\n"); fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); + fprintf(samtools_stderr, " --duplicate-count Record the original primary read duplication count(include itself) in a \'dc\' tag.\n"); sam_global_opt_help(samtools_stderr, "-.O..@.."); @@ -2265,7 +2293,7 @@ int bam_markdup(int argc, char **argv) { char *regex = NULL, *bc_regex = NULL; char *regex_order = "txy"; md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0}; + 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2280,6 +2308,7 @@ int bam_markdup(int argc, char **argv) { {"barcode-rgx", required_argument, NULL, 1008}, {"use-read-groups", no_argument, NULL, 1009}, {"json", no_argument, NULL, 1010}, + {"duplicate-count", no_argument, NULL, 1011}, {NULL, 0, NULL, 0} }; @@ -2316,6 +2345,7 @@ int bam_markdup(int argc, char **argv) { case 1008: bc_name = 1, bc_regex = optarg; break; case 1009: param.read_groups = 1; break; case 1010: param.json = 1; param.do_stats = 1; break; + case 1011: param.dc = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); diff --git a/samtools/bam_md.c b/samtools/bam_md.c index d7fd60f..b9182b6 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -411,8 +411,11 @@ int bam_fillmd(int argc, char *argv[]) header = sam_hdr_read(fp); if (header == NULL || sam_hdr_nref(header) == 0) { - fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - goto fail; + // NB: if we have no SQ headers but have aligned data, then this will + // be caught during processing with e.g. 
+ // "[E::sam_parse1] no SQ lines present in the header" + fprintf(stderr, "[bam_fillmd] warning: input SAM does not have " + "header, performing a no-op.\n"); } fpout = sam_open_format("-", mode_w, &ga.out); diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index 0daf177..795eccb 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -413,8 +413,11 @@ int bam_fillmd(int argc, char *argv[]) header = sam_hdr_read(fp); if (header == NULL || sam_hdr_nref(header) == 0) { - fprintf(samtools_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - goto fail; + // NB: if we have no SQ headers but have aligned data, then this will + // be caught during processing with e.g. + // "[E::sam_parse1] no SQ lines present in the header" + fprintf(samtools_stderr, "[bam_fillmd] warning: input SAM does not have " + "header, performing a no-op.\n"); } fpout = sam_open_format(samtools_stdout_fn, mode_w, &ga.out); diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c index 0ad308a..f84c805 100644 --- a/samtools/bam_reheader.c +++ b/samtools/bam_reheader.c @@ -127,6 +127,11 @@ int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg) if (!h) return ret; + // Match output version number with input file. + char vers[99]; + sprintf(vers, "%d.%d", cram_major_vers(in), cram_minor_vers(in)); + cram_set_option(out, CRAM_OPT_VERSION, vers); + // Attempt to fill out a cram->refs[] array from @SQ headers sam_hdr_t *cram_h = sam_hdr_dup(h); if (!cram_h) diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index 22a6cd9..5a78c66 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -129,6 +129,11 @@ int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg) if (!h) return ret; + // Match output version number with input file. 
+ char vers[99]; + sprintf(vers, "%d.%d", cram_major_vers(in), cram_minor_vers(in)); + cram_set_option(out, CRAM_OPT_VERSION, vers); + // Attempt to fill out a cram->refs[] array from @SQ headers sam_hdr_t *cram_h = sam_hdr_dup(h); if (!cram_h) diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 875e29c..b44bd66 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -1,6 +1,6 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -54,6 +54,8 @@ DEALINGS IN THE SOFTWARE. */ #include "bedidx.h" #include "bam.h" +//#define DEBUG_MINHASH + #define BAM_BLOCK_SIZE 2*1024*1024 #define MAX_TMP_FILES 64 @@ -1783,7 +1785,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, htsThreadPool *htspool, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, - int write_index) { + int write_index, int final_out) { samFile *fpout = NULL, **fp = NULL; heap1_t *heap = NULL; uint64_t idx = 0; @@ -1884,7 +1886,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, ks_heapmake(heap, heap_size, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->entry.bam_record; - if (g_sam_order == MinHash && b->core.tid == -1) { + if (g_sam_order == MinHash && b->core.tid == -1 && final_out) { // Remove the cached minhash value b->core.pos = -1; b->core.mpos = -1; @@ -2052,6 +2054,11 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b) // // The 64-bit sort key is split over the bam pos and isize fields. // This permits it to survive writing to temporary file and coming back. 
+ +#ifdef DEBUG_MINHASH +static int ntot = 0, nmis = 0, ndup = 0; +#endif + static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) { const bam1_t *A = a.bam_record; @@ -2062,16 +2069,18 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b); - const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos; - const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos; + const uint64_t m_a = (((uint64_t)A->core.pos)<<31)|(uint32_t)A->core.mpos; + const uint64_t m_b = (((uint64_t)B->core.pos)<<31)|(uint32_t)B->core.mpos; if (m_a < m_b) // by hash return -1; else if (m_a > m_b) return 1; - else if (A->core.isize < B->core.isize) // by hash location in seq + + // Bigger pos with size minhash means starts further to left + else if (A->core.isize > B->core.isize) // by hash location in seq return -1; - else if (A->core.isize > B->core.isize) + else if (A->core.isize < B->core.isize) return 1; else return bam1_cmp_core(a,b); @@ -2243,6 +2252,8 @@ typedef struct { int error; int large_pos; int minimiser_kmer; + bool try_rev; + bool no_squash; } worker_t; // Returns 0 for success @@ -2273,6 +2284,8 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu for (i = 0; i < l; ++i) { bam1_t *b = buf[i].bam_record; if (clear_minhash && b->core.tid == -1) { + // To see the position for debugging + // b->core.pos = ((((uint64_t)b->core.pos)<<31)|(uint32_t)b->core.mpos) + b->core.isize; // Remove the cached minhash value b->core.pos = -1; b->core.mpos = -1; @@ -2381,8 +2394,15 @@ err: return ret; } +KHASH_MAP_INIT_INT64(kmer, int64_t) +static khash_t(kmer) *kmer_h = NULL; + +// Punt homopolymers somewhere central in the hash space +#define XOR 0xdead7878beef7878 + /* - * Computes the minhash of a sequence using both forward and reverse strands. 
+ * Computes the minhash of a sequence using forward strand and if requested + * reverse strand. * * This is used as a sort key for unmapped data, to collate like sequences * together and to improve compression ratio. @@ -2390,13 +2410,219 @@ err: * The minhash is returned and *pos filled out with location of this hash * key in the sequence if pos != NULL. */ -static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) { +static uint64_t minhash(bam1_t *b, int kmer, int window, int *curr_pos, + int *end, int *is_rev, int try_fwd, int try_rev, + int no_squash) { uint64_t hashf = 0, minhashf = UINT64_MAX; - uint64_t hashr = 0, minhashr = UINT64_MAX; - int minhashpf = 0, minhashpr = 0, i; + int minhashpf = *curr_pos, i, j; + uint64_t mask = (1L<<(2*kmer))-1; + uint8_t *seq = bam_get_seq(b); + int len = b->core.l_qseq; + uint64_t xor = XOR & mask; + + if (is_rev) *is_rev = 0; + + // Lookup tables for bam_seqi to 0123 fwd/rev hashes + // =ACM GRSV TWYH KDBN +#define X 0 + static unsigned char L[16] = { + X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X, + }; + uint64_t R[16] = { + X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X, + }; + for (i = 0; i < 16; i++) + R[i] <<= 2*(kmer-1); + + int i_start = *curr_pos; + int i_end = MIN(i_start + window, len); + int last_base = -1; + + if (try_fwd) { + // Initialise hash keys + for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) { + int base = bam_seqi(seq, i); + // collapse homopolymers + if (no_squash || last_base != base) { + last_base = base; + hashf = (hashf<<2) | L[base]; + j++; + } + } + + // Loop to find minimum + if (no_squash) { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + hashf = (hashf<<2) | L[base]; + uint64_t hashfx = (hashf ^ XOR) & mask; + if (minhashf > hashfx) + minhashf = hashfx, minhashpf = i; + } + } else { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + if (last_base != base) { + last_base = base; + hashf = (hashf<<2) | L[base]; + uint64_t hashfx = (hashf ^ XOR) & mask; + if (minhashf > hashfx) + 
minhashf = hashfx, minhashpf = i; + } + } + } + } + + // Same as above for the reverse strand. + // Not used for now, but we may wish to consider indexing in both + // strands, recording the strand in value (pos), and comparing in one + // strand only. Right now we compare on both against a single-stranded + // index. + if (try_rev) { + uint64_t hashr = 0, minhashr = UINT64_MAX; + int minhashpr = *curr_pos; + int last_base = -1; + + for (i = i_start, j = 0; j < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + if (no_squash || last_base != base) { + last_base = base; + hashr = (hashr>>2) | R[base]; + j++; + } + } + + if (no_squash) { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + hashr = (hashr>>2) | R[base]; + if (minhashr > (hashr^xor)) + minhashr = (hashr^xor), minhashpr = len-i+kmer-2; + } + } else { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + if (last_base != base) { + last_base = base; + hashr = (hashr>>2) | R[base]; + if (minhashr > (hashr^xor)) + minhashr = (hashr^xor), minhashpr = len-i+kmer-2; + } + } + } + + if (minhashr < minhashf) { + minhashf = minhashr; + minhashpf = minhashpr; + if (is_rev) *is_rev = 1; + } + } + + // "*curr_pos = minhashpf" is faster here, but is sometimes + // poorer in compression. Eg 10 million novaseq records with + // 75.1MB vs 76.9MB cram BA field. 
+ //*curr_pos = minhashpf; + *curr_pos = minhashpf - (kmer-1); + if (end) *end = (i_end == len); + return minhashf; +} + +#define UNIQ_BIT 60 +#define UNIQ_TEST(x) (((x) & (1ULL<= 0) { + //fprintf(stderr, "LEN\t%d\t%s\n", b->core.l_qseq, bam_get_qname(b)); + uint64_t hashf; + int pos = 0, end = 0; + khiter_t k; + int ret; + + if (b->core.l_qseq < window) + continue; + + // fwd + while (!end) { + int last_pos = pos; + hashf = minhash(b, kmer, window, &pos, &end, NULL, 1, 0, + no_squash); + k = kh_put(kmer, kmer_h, hashf, &ret); + kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<core.l_qseq; + +// We could also add reverse keys to the index here. +// This would avoid reverse complementing during the matching stage. +// We'd need to add a flag (another high bit of kh_value) to indicate +// strand. +// I'm unsure if this is a good trade-off or not. + +// // rev +// pos = 0; end = 0; +// while (!end) { +// hashf = minhash(b, kmer, window, &pos, &end, NULL, 0, 1, +// no_squash); +// k = kh_put(kmer, kmer_h, hashf, &ret); +// kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<core.l_qseq; + } + if (r < -1) + goto err; + + ret = 0; + err: + if (b) bam_destroy1(b); + if (h) sam_hdr_destroy(h); + sam_close(in); + + return ret; +} + +/* + * A variant of minhash that compares against a previously built index. + * + * We follow the same steps of scanning through this sequence to find the + * minimum hash, but we prefer hash keys that have unique placement in the + * index, or if not unique, then non-uniquely placed, over ones that + * are absent from the index. 
+ */ +static uint64_t minhash_with_idx(bam1_t *b, int kmer, int *pos, int *rev, + bool try_rev) { + uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX; + uint64_t minhashfd = UINT64_MAX; + int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j; uint64_t mask = (1L<<(2*kmer))-1; unsigned char *seq = bam_get_seq(b); int len = b->core.l_qseq; + const uint64_t xor = XOR & mask; // Lookup tables for bam_seqi to 0123 fwd/rev hashes // =ACM GRSV TWYH KDBN @@ -2410,39 +2636,266 @@ static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) { for (i = 0; i < 16; i++) R[i] <<= 2*(kmer-1); - // Punt homopolymers somewhere central in the hash space -#define XOR (0xdead7878beef7878 & mask) - // Initialise hash keys - for (i = 0; i < kmer-1 && i < len; i++) { + for (i = j = 0; j < kmer-1 && i < len; i++, j++) { int base = bam_seqi(seq, i); hashf = (hashf<<2) | L[base]; - hashr = (hashr>>2) | R[base]; } // Loop to find minimum + int found_f = 0, found_r = 0; for (; i < len; i++) { int base = bam_seqi(seq, i); + hashf = ((hashf<<2) | L[base]) & mask; + const uint64_t hashfx = hashf^xor; + + // Priority for sorting + // 1. Unique key in index + // 2. Dup key in index + // 3. Everything else + int index = 0; + if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) { + khiter_t k = kh_get(kmer, kmer_h, hashfx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_f |= index; + switch (index) { + case 2: minhashfi = hashfx, minhashpfi = i; break; + case 1: minhashfd = hashfx, minhashpfd = i; break; + + default: + if (minhashf > hashfx) + minhashf = hashfx, minhashpf = i; + } + } + + if (minhashfi != UINT64_MAX) + minhashf = minhashfi, minhashpf = minhashpfi; + else if (minhashfd != UINT64_MAX) + minhashf = minhashfd, minhashpf = minhashpfd; + + // Same as above for the reverse strand + int dir = 0; + if (try_rev) { + uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX; + uint64_t minhashrd = UINT64_MAX; + int minhashpr = 0, minhashpri = 0, minhashprd = 0; + + for (i = j = 0; j < kmer-1 && i < len; i++, j++) { + int base = bam_seqi(seq, i); + hashr = (hashr>>2) | R[base]; + } + for (; i < len; i++) { + int base = bam_seqi(seq, i); + hashr = (hashr>>2) | R[base]; + const uint64_t hashrx = hashr^xor; + + int index = 0; + if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) { + khiter_t k = kh_get(kmer, kmer_h, hashrx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_r |= index; + switch (index) { + case 2: minhashri = hashrx, minhashpri = i; break; + case 1: minhashrd = hashrx, minhashprd = i; break; + + default: + if (minhashr > hashrx) + minhashr = hashrx, minhashpr = i; + } + } + if (minhashri != UINT64_MAX) + minhashr = minhashri, minhashpr = minhashpri; + else if (minhashrd != UINT64_MAX) + minhashr = minhashrd, minhashpr = minhashprd; + + // Pick reverse if better mapping + if ((minhashf > minhashr) || (!found_f && found_r)) { + if (!found_f || found_r) { + minhashf = minhashr; + minhashpf = b->core.l_qseq - minhashpr + kmer - 2; + dir = 1; + } + } + } + +#ifdef DEBUG_MINHASH + ntot++; + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) { + if (!UNIQ_TEST(kh_value(kmer_h, k))) + ndup++; + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; + } else { + nmis++; + } +#else + // For indexed kmers, our hash key is the position the kmer + // occurs in the concatenated reference rather than the hash itself. + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; +#endif + + if (rev) *rev = dir; + if (pos) *pos = minhashpf; + return minhashf != UINT64_MAX ? minhashf : 0; +} + +// As per minhash_with_idx but with homopolymer squashing enabled. +// This function is duplicated to remove conditionals and speed up the +// hashing code. (Minus the ifdef-ed out code, which is kept above mainly +// for posterity.) 
+static uint64_t minhash_with_idx_squash(bam1_t *b, int kmer, int *pos, + int *rev, bool try_rev) { + uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX; + uint64_t minhashfd = UINT64_MAX; + int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j; + uint64_t mask = (1L<<(2*kmer))-1; + unsigned char *seq = bam_get_seq(b); + int len = b->core.l_qseq; + const uint64_t xor = XOR & mask; + + // Lookup tables for bam_seqi to 0123 fwd/rev hashes + // =ACM GRSV TWYH KDBN +#define X 0 + unsigned char L[16] = { + X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X, + }; + uint64_t R[16] = { + X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X, + }; + for (i = 0; i < 16; i++) + R[i] <<= 2*(kmer-1); + + // Initialise hash keys + int last_base = -1; + for (i = j = 0; j < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; + j++; + hashf = (hashf<<2) | L[base]; + } + + // Loop to find minimum + int found_f = 0, found_r = 0; + for (; i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; hashf = ((hashf<<2) | L[base]) & mask; - hashr = (hashr>>2) | R[base]; + const uint64_t hashfx = hashf^xor; + + // Priority for sorting + // 1. Unique key in index + // 2. Dup key in index + // 3. Everything else + int index = 0; + if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) { + khiter_t k = kh_get(kmer, kmer_h, hashfx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_f |= index; + switch (index) { + case 2: minhashfi = hashfx, minhashpfi = i; break; + case 1: minhashfd = hashfx, minhashpfd = i; break; - if (minhashf > (hashf^XOR)) - minhashf = (hashf^XOR), minhashpf = i; - if (minhashr > (hashr^XOR)) - minhashr = (hashr^XOR), minhashpr = len-i+kmer-2; + default: + if (minhashf > hashfx) + minhashf = hashfx, minhashpf = i; + } + } + + if (minhashfi != UINT64_MAX) + minhashf = minhashfi, minhashpf = minhashpfi; + else if (minhashfd != UINT64_MAX) + minhashf = minhashfd, minhashpf = minhashpfd; + + // Same as above for the reverse strand + int dir = 0; + if (try_rev) { + uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX; + uint64_t minhashrd = UINT64_MAX; + int minhashpr = 0, minhashpri = 0, minhashprd = 0; + int last_base = -1; + + for (i = j = 0; j < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; + j++; + hashr = (hashr>>2) | R[base]; + } + for (; i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; + hashr = (hashr>>2) | R[base]; + const uint64_t hashrx = hashr^xor; + + int index = 0; + if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) { + khiter_t k = kh_get(kmer, kmer_h, hashrx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_r |= index; + switch (index) { + case 2: minhashri = hashrx, minhashpri = i; break; + case 1: minhashrd = hashrx, minhashprd = i; break; + default: + if (minhashr > hashrx) + minhashr = hashrx, minhashpr = i; + } + } + if (minhashri != UINT64_MAX) + minhashr = minhashri, minhashpr = minhashpri; + else if (minhashrd != UINT64_MAX) + minhashr = minhashrd, minhashpr = minhashprd; + + // Pick reverse if better mapping + if ((minhashf > minhashr) || (!found_f && found_r)) { + if (!found_f || found_r) { + minhashf = minhashr; + minhashpf = b->core.l_qseq - minhashpr + kmer - 2; + dir = 1; + } + } } - if (minhashf <= minhashr) { - if (rev) *rev = 0; - if (pos) *pos = minhashpf; - return minhashf; +#ifdef DEBUG_MINHASH + ntot++; + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) { + if (!UNIQ_TEST(kh_value(kmer_h, k))) + ndup++; + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; } else { - if (rev) *rev = 1; - if (pos) *pos = minhashpr; - return minhashr; + nmis++; } +#else + // For indexed kmers, our hash key is the position the kmer + // occurs in the concatenated reference rather than the hash itself. + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; +#endif + + if (rev) *rev = dir; + if (pos) *pos = minhashpf; + + return minhashf != UINT64_MAX ? minhashf : 0; } //--- Start of candidates to punt to htslib @@ -2556,18 +3009,35 @@ static inline void worker_minhash(worker_t *w) { continue; int pos = 0, rev = 0; - uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev); + uint64_t mh = kmer_h + ? (w->no_squash + ? minhash_with_idx(b, w->minimiser_kmer, &pos, &rev, + w->try_rev) + : minhash_with_idx_squash(b, w->minimiser_kmer, &pos, &rev, + w->try_rev) + ) + : minhash(b, w->minimiser_kmer, b->core.l_qseq, + &pos, NULL, &rev, 1, w->try_rev, w->no_squash); if (rev) reverse_complement(b); + if (!kmer_h) { + mh += 1LL<<30; + pos = 65535-pos >= 0 ? 
65535-pos : 0; + } else { + mh -= pos; + pos = 0; + } + + // Store 64-bit hash in unmapped pos and mpos fields. // The position of hash is in isize, which we use for // resolving ties when sorting by hash key. // These are unused for completely unmapped data and // will be reset during final output. - b->core.pos = mh>>31; + b->core.pos = (mh>>31) & 0x7fffffff; b->core.mpos = mh&0x7fffffff; - b->core.isize = 65535-pos >=0 ? 65535-pos : 0; + b->core.isize = pos; } } @@ -2595,7 +3065,8 @@ static void *worker(void *data) static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h, int n_threads, buf_region *in_mem, - int large_pos, int minimiser_kmer) + int large_pos, int minimiser_kmer, bool try_rev, + bool no_squash) { int i; size_t pos, rest; @@ -2619,6 +3090,8 @@ static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h, w[i].h = h; w[i].large_pos = large_pos; w[i].minimiser_kmer = minimiser_kmer; + w[i].try_rev = try_rev; + w[i].no_squash = no_squash; in_mem[i].from = pos; in_mem[i].to = pos + w[i].buf_len; pos += w[i].buf_len; rest -= w[i].buf_len; @@ -2700,6 +3173,7 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) @param sam_order the order in which the sort should occur @param sort_tag the tag to use if sorting by Tag @param minimiser_kmer the kmer size when sorting by MinHash + @param try_rev try reverse strand when sorting by MinHash @param fn name of the file to be sorted @param prefix prefix of the temporary files (prefix.NNNN.bam are written) @param fnout name of the final output file to be written @@ -2717,9 +3191,9 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) NOT thread safe. 
*/ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, - const char *fn, const char *prefix, - const char *fnout, const char *modeout, - size_t _max_mem, int n_threads, + bool try_rev, bool no_squash, const char *fn, + const char *prefix, const char *fnout, + const char *modeout, size_t _max_mem, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -2958,7 +3432,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, goto err; int sort_res = sort_blocks(k, buf, header, n_threads, - in_mem, large_pos, minimiser_kmer); + in_mem, large_pos, minimiser_kmer, + try_rev, no_squash); if (sort_res < 0) goto err; @@ -2988,7 +3463,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, &fns[consolidate_from], n_threads, in_mem, buf, keys, lib_lookup, &htspool, "sort", NULL, NULL, - NULL, 1, 0) >= 0) { + NULL, 1, 0, 0) >= 0) { merge_res = 0; break; } @@ -3031,7 +3506,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, // Sort last records if (k > 0) { num_in_mem = sort_blocks(k, buf, header, n_threads, - in_mem, large_pos, minimiser_kmer); + in_mem, large_pos, minimiser_kmer, try_rev, + no_squash); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -3060,7 +3536,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header, n_files, fns, num_in_mem, in_mem, buf, keys, lib_lookup, &htspool, "sort", in_fmt, out_fmt, - arg_list, no_pg, write_index) < 0) { + arg_list, no_pg, write_index, 1) < 0) { // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. goto err; @@ -3109,7 +3585,8 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma sprintf(fnout, "%s.bam", prefix); SamOrder sam_order = is_by_qname ? 
QueryName : Coordinate; g_sam_order = sam_order; - ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); + ret = bam_sort_core_ext(sam_order, NULL, 0, false, true, fn, prefix, + fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); free(fnout); return ret; } @@ -3123,7 +3600,11 @@ static void sort_usage(FILE *fp) " -u Output uncompressed data (equivalent to -l 0)\n" " -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n" " -M Use minimiser for clustering unaligned/unplaced reads\n" +" -R Do not use reverse strand (only compatible with -M)\n" " -K INT Kmer size to use for minimiser [20]\n" +" -I FILE Order minimisers by their position in FILE FASTA\n" +" -w INT Window size for minimiser indexing via -I ref.fa [100]\n" +" -H Squash homopolymers when computing minimiser\n" " -n Sort by read name (not compatible with samtools index command)\n" " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" @@ -3159,11 +3640,15 @@ int bam_sort(int argc, char *argv[]) SamOrder sam_order = Coordinate; bool by_tag = false; int minimiser_kmer = 20; + bool try_rev = true; char* sort_tag = NULL, *arg_list = NULL; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; struct stat st; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + int window = 100; + char *minimiser_ref = NULL; + int no_squash = 1; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -3173,7 +3658,7 @@ int bam_sort(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; case 'n': sam_order = QueryName; break; @@ -3192,6 +3677,15 @@ int bam_sort(int argc, char *argv[]) case 1: no_pg 
= 1; break; case 2: sam_order = TemplateCoordinate; break; case 'M': sam_order = MinHash; break; + case 'I': + sam_order = MinHash; // implicit option + minimiser_ref = optarg; + break; + case 'H': no_squash = 0; break; + + case 'w': window = atoi(optarg); break; + + case 'R': try_rev = false; break; case 'K': minimiser_kmer = atoi(optarg); if (minimiser_kmer < 1) @@ -3206,6 +3700,17 @@ int bam_sort(int argc, char *argv[]) } } + if (minimiser_ref) { + fprintf(stderr, "Building index ... "); + fflush(stderr); + if (build_minhash_index(minimiser_ref, minimiser_kmer, window, + no_squash)) { + ret = EXIT_FAILURE; + goto sort_end; + } + fprintf(stderr, "done\n"); + } + // Change sort order if tag sorting is requested. Must update based on secondary index if (by_tag) { sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate; @@ -3262,7 +3767,9 @@ int bam_sort(int argc, char *argv[]) ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); } - ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0, + ret = bam_sort_core_ext(sam_order, sort_tag, + (sam_order == MinHash) ? minimiser_kmer : 0, + try_rev, no_squash, (nargs > 0) ? argv[optind] : "-", tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, &ga.in, &ga.out, arg_list, no_pg, ga.write_index); @@ -3278,6 +3785,12 @@ int bam_sort(int argc, char *argv[]) ret = EXIT_FAILURE; } +#ifdef DEBUG_MINHASH + fprintf(stderr, "Missed %.1f%%, dup %.1f%%\n", + 100.0*nmis/(ntot+.1), + 100.0*ndup/(ntot+.1)); +#endif + sort_end: free(tmpprefix.s); free(arg_list); diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 4353f61..80aa4d8 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -2,7 +2,7 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. 
Author: Heng Li @@ -56,6 +56,8 @@ DEALINGS IN THE SOFTWARE. */ #include "bedidx.h" #include "bam.h" +//#define DEBUG_MINHASH + #define BAM_BLOCK_SIZE 2*1024*1024 #define MAX_TMP_FILES 64 @@ -1785,7 +1787,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, htsThreadPool *htspool, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, - int write_index) { + int write_index, int final_out) { samFile *fpout = NULL, **fp = NULL; heap1_t *heap = NULL; uint64_t idx = 0; @@ -1886,7 +1888,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, ks_heapmake(heap, heap_size, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->entry.bam_record; - if (g_sam_order == MinHash && b->core.tid == -1) { + if (g_sam_order == MinHash && b->core.tid == -1 && final_out) { // Remove the cached minhash value b->core.pos = -1; b->core.mpos = -1; @@ -2054,6 +2056,11 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b) // // The 64-bit sort key is split over the bam pos and isize fields. // This permits it to survive writing to temporary file and coming back. 
+ +#ifdef DEBUG_MINHASH +static int ntot = 0, nmis = 0, ndup = 0; +#endif + static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) { const bam1_t *A = a.bam_record; @@ -2064,16 +2071,18 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b); - const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos; - const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos; + const uint64_t m_a = (((uint64_t)A->core.pos)<<31)|(uint32_t)A->core.mpos; + const uint64_t m_b = (((uint64_t)B->core.pos)<<31)|(uint32_t)B->core.mpos; if (m_a < m_b) // by hash return -1; else if (m_a > m_b) return 1; - else if (A->core.isize < B->core.isize) // by hash location in seq + + // Bigger pos with size minhash means starts further to left + else if (A->core.isize > B->core.isize) // by hash location in seq return -1; - else if (A->core.isize > B->core.isize) + else if (A->core.isize < B->core.isize) return 1; else return bam1_cmp_core(a,b); @@ -2245,6 +2254,8 @@ typedef struct { int error; int large_pos; int minimiser_kmer; + bool try_rev; + bool no_squash; } worker_t; // Returns 0 for success @@ -2275,6 +2286,8 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu for (i = 0; i < l; ++i) { bam1_t *b = buf[i].bam_record; if (clear_minhash && b->core.tid == -1) { + // To see the position for debugging + // b->core.pos = ((((uint64_t)b->core.pos)<<31)|(uint32_t)b->core.mpos) + b->core.isize; // Remove the cached minhash value b->core.pos = -1; b->core.mpos = -1; @@ -2383,8 +2396,15 @@ err: return ret; } +KHASH_MAP_INIT_INT64(kmer, int64_t) +static khash_t(kmer) *kmer_h = NULL; + +// Punt homopolymers somewhere central in the hash space +#define XOR 0xdead7878beef7878 + /* - * Computes the minhash of a sequence using both forward and reverse strands. 
+ * Computes the minhash of a sequence using forward strand and if requested + * reverse strand. * * This is used as a sort key for unmapped data, to collate like sequences * together and to improve compression ratio. @@ -2392,13 +2412,219 @@ err: * The minhash is returned and *pos filled out with location of this hash * key in the sequence if pos != NULL. */ -static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) { +static uint64_t minhash(bam1_t *b, int kmer, int window, int *curr_pos, + int *end, int *is_rev, int try_fwd, int try_rev, + int no_squash) { uint64_t hashf = 0, minhashf = UINT64_MAX; - uint64_t hashr = 0, minhashr = UINT64_MAX; - int minhashpf = 0, minhashpr = 0, i; + int minhashpf = *curr_pos, i, j; + uint64_t mask = (1L<<(2*kmer))-1; + uint8_t *seq = bam_get_seq(b); + int len = b->core.l_qseq; + uint64_t xor = XOR & mask; + + if (is_rev) *is_rev = 0; + + // Lookup tables for bam_seqi to 0123 fwd/rev hashes + // =ACM GRSV TWYH KDBN +#define X 0 + static unsigned char L[16] = { + X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X, + }; + uint64_t R[16] = { + X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X, + }; + for (i = 0; i < 16; i++) + R[i] <<= 2*(kmer-1); + + int i_start = *curr_pos; + int i_end = MIN(i_start + window, len); + int last_base = -1; + + if (try_fwd) { + // Initialise hash keys + for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) { + int base = bam_seqi(seq, i); + // collapse homopolymers + if (no_squash || last_base != base) { + last_base = base; + hashf = (hashf<<2) | L[base]; + j++; + } + } + + // Loop to find minimum + if (no_squash) { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + hashf = (hashf<<2) | L[base]; + uint64_t hashfx = (hashf ^ XOR) & mask; + if (minhashf > hashfx) + minhashf = hashfx, minhashpf = i; + } + } else { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + if (last_base != base) { + last_base = base; + hashf = (hashf<<2) | L[base]; + uint64_t hashfx = (hashf ^ XOR) & mask; + if (minhashf > hashfx) + 
minhashf = hashfx, minhashpf = i; + } + } + } + } + + // Same as above for the reverse strand. + // Not used for now, but we may wish to consider indexing in both + // strands, recording the strand in value (pos), and comparing in one + // strand only. Right now we compare on both against a single-stranded + // index. + if (try_rev) { + uint64_t hashr = 0, minhashr = UINT64_MAX; + int minhashpr = *curr_pos; + int last_base = -1; + + for (i = i_start, j = 0; j < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + if (no_squash || last_base != base) { + last_base = base; + hashr = (hashr>>2) | R[base]; + j++; + } + } + + if (no_squash) { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + hashr = (hashr>>2) | R[base]; + if (minhashr > (hashr^xor)) + minhashr = (hashr^xor), minhashpr = len-i+kmer-2; + } + } else { + for (; i < i_end; i++) { + int base = bam_seqi(seq, i); + if (last_base != base) { + last_base = base; + hashr = (hashr>>2) | R[base]; + if (minhashr > (hashr^xor)) + minhashr = (hashr^xor), minhashpr = len-i+kmer-2; + } + } + } + + if (minhashr < minhashf) { + minhashf = minhashr; + minhashpf = minhashpr; + if (is_rev) *is_rev = 1; + } + } + + // "*curr_pos = minhashpf" is faster here, but is sometimes + // poorer in compression. Eg 10 million novaseq records with + // 75.1MB vs 76.9MB cram BA field. 
+ //*curr_pos = minhashpf; + *curr_pos = minhashpf - (kmer-1); + if (end) *end = (i_end == len); + return minhashf; +} + +#define UNIQ_BIT 60 +#define UNIQ_TEST(x) (((x) & (1ULL<= 0) { + //fprintf(samtools_stderr, "LEN\t%d\t%s\n", b->core.l_qseq, bam_get_qname(b)); + uint64_t hashf; + int pos = 0, end = 0; + khiter_t k; + int ret; + + if (b->core.l_qseq < window) + continue; + + // fwd + while (!end) { + int last_pos = pos; + hashf = minhash(b, kmer, window, &pos, &end, NULL, 1, 0, + no_squash); + k = kh_put(kmer, kmer_h, hashf, &ret); + kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<core.l_qseq; + +// We could also add reverse keys to the index here. +// This would avoid reverse complementing during the matching stage. +// We'd need to add a flag (another high bit of kh_value) to indicate +// strand. +// I'm unsure if this is a good trade-off or not. + +// // rev +// pos = 0; end = 0; +// while (!end) { +// hashf = minhash(b, kmer, window, &pos, &end, NULL, 0, 1, +// no_squash); +// k = kh_put(kmer, kmer_h, hashf, &ret); +// kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<core.l_qseq; + } + if (r < -1) + goto err; + + ret = 0; + err: + if (b) bam_destroy1(b); + if (h) sam_hdr_destroy(h); + sam_close(in); + + return ret; +} + +/* + * A variant of minhash that compares against a previously built index. + * + * We follow the same steps of scanning through this sequence to find the + * minimum hash, but we prefer hash keys that have unique placement in the + * index, or if not unique, then non-uniquely placed, over ones that + * are absent from the index. 
+ */ +static uint64_t minhash_with_idx(bam1_t *b, int kmer, int *pos, int *rev, + bool try_rev) { + uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX; + uint64_t minhashfd = UINT64_MAX; + int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j; uint64_t mask = (1L<<(2*kmer))-1; unsigned char *seq = bam_get_seq(b); int len = b->core.l_qseq; + const uint64_t xor = XOR & mask; // Lookup tables for bam_seqi to 0123 fwd/rev hashes // =ACM GRSV TWYH KDBN @@ -2412,39 +2638,266 @@ static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) { for (i = 0; i < 16; i++) R[i] <<= 2*(kmer-1); - // Punt homopolymers somewhere central in the hash space -#define XOR (0xdead7878beef7878 & mask) - // Initialise hash keys - for (i = 0; i < kmer-1 && i < len; i++) { + for (i = j = 0; j < kmer-1 && i < len; i++, j++) { int base = bam_seqi(seq, i); hashf = (hashf<<2) | L[base]; - hashr = (hashr>>2) | R[base]; } // Loop to find minimum + int found_f = 0, found_r = 0; for (; i < len; i++) { int base = bam_seqi(seq, i); + hashf = ((hashf<<2) | L[base]) & mask; + const uint64_t hashfx = hashf^xor; + + // Priority for sorting + // 1. Unique key in index + // 2. Dup key in index + // 3. Everything else + int index = 0; + if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) { + khiter_t k = kh_get(kmer, kmer_h, hashfx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_f |= index; + switch (index) { + case 2: minhashfi = hashfx, minhashpfi = i; break; + case 1: minhashfd = hashfx, minhashpfd = i; break; + + default: + if (minhashf > hashfx) + minhashf = hashfx, minhashpf = i; + } + } + + if (minhashfi != UINT64_MAX) + minhashf = minhashfi, minhashpf = minhashpfi; + else if (minhashfd != UINT64_MAX) + minhashf = minhashfd, minhashpf = minhashpfd; + + // Same as above for the reverse strand + int dir = 0; + if (try_rev) { + uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX; + uint64_t minhashrd = UINT64_MAX; + int minhashpr = 0, minhashpri = 0, minhashprd = 0; + + for (i = j = 0; j < kmer-1 && i < len; i++, j++) { + int base = bam_seqi(seq, i); + hashr = (hashr>>2) | R[base]; + } + for (; i < len; i++) { + int base = bam_seqi(seq, i); + hashr = (hashr>>2) | R[base]; + const uint64_t hashrx = hashr^xor; + + int index = 0; + if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) { + khiter_t k = kh_get(kmer, kmer_h, hashrx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_r |= index; + switch (index) { + case 2: minhashri = hashrx, minhashpri = i; break; + case 1: minhashrd = hashrx, minhashprd = i; break; + + default: + if (minhashr > hashrx) + minhashr = hashrx, minhashpr = i; + } + } + if (minhashri != UINT64_MAX) + minhashr = minhashri, minhashpr = minhashpri; + else if (minhashrd != UINT64_MAX) + minhashr = minhashrd, minhashpr = minhashprd; + + // Pick reverse if better mapping + if ((minhashf > minhashr) || (!found_f && found_r)) { + if (!found_f || found_r) { + minhashf = minhashr; + minhashpf = b->core.l_qseq - minhashpr + kmer - 2; + dir = 1; + } + } + } + +#ifdef DEBUG_MINHASH + ntot++; + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) { + if (!UNIQ_TEST(kh_value(kmer_h, k))) + ndup++; + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; + } else { + nmis++; + } +#else + // For indexed kmers, our hash key is the position the kmer + // occurs in the concatenated reference rather than the hash itself. + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; +#endif + + if (rev) *rev = dir; + if (pos) *pos = minhashpf; + return minhashf != UINT64_MAX ? minhashf : 0; +} + +// As per minhash_with_idx but with homopolymer squashing enabled. +// This function is duplicated to remove conditionals and speed up the +// hashing code. (Minus the ifdef-ed out code, which is kept above mainly +// for posterity.) 
+static uint64_t minhash_with_idx_squash(bam1_t *b, int kmer, int *pos, + int *rev, bool try_rev) { + uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX; + uint64_t minhashfd = UINT64_MAX; + int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j; + uint64_t mask = (1L<<(2*kmer))-1; + unsigned char *seq = bam_get_seq(b); + int len = b->core.l_qseq; + const uint64_t xor = XOR & mask; + + // Lookup tables for bam_seqi to 0123 fwd/rev hashes + // =ACM GRSV TWYH KDBN +#define X 0 + unsigned char L[16] = { + X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X, + }; + uint64_t R[16] = { + X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X, + }; + for (i = 0; i < 16; i++) + R[i] <<= 2*(kmer-1); + + // Initialise hash keys + int last_base = -1; + for (i = j = 0; j < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; + j++; + hashf = (hashf<<2) | L[base]; + } + + // Loop to find minimum + int found_f = 0, found_r = 0; + for (; i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; hashf = ((hashf<<2) | L[base]) & mask; - hashr = (hashr>>2) | R[base]; + const uint64_t hashfx = hashf^xor; + + // Priority for sorting + // 1. Unique key in index + // 2. Dup key in index + // 3. Everything else + int index = 0; + if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) { + khiter_t k = kh_get(kmer, kmer_h, hashfx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_f |= index; + switch (index) { + case 2: minhashfi = hashfx, minhashpfi = i; break; + case 1: minhashfd = hashfx, minhashpfd = i; break; - if (minhashf > (hashf^XOR)) - minhashf = (hashf^XOR), minhashpf = i; - if (minhashr > (hashr^XOR)) - minhashr = (hashr^XOR), minhashpr = len-i+kmer-2; + default: + if (minhashf > hashfx) + minhashf = hashfx, minhashpf = i; + } + } + + if (minhashfi != UINT64_MAX) + minhashf = minhashfi, minhashpf = minhashpfi; + else if (minhashfd != UINT64_MAX) + minhashf = minhashfd, minhashpf = minhashpfd; + + // Same as above for the reverse strand + int dir = 0; + if (try_rev) { + uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX; + uint64_t minhashrd = UINT64_MAX; + int minhashpr = 0, minhashpri = 0, minhashprd = 0; + int last_base = -1; + + for (i = j = 0; j < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; + j++; + hashr = (hashr>>2) | R[base]; + } + for (; i < len; i++) { + int base = bam_seqi(seq, i); + if (base == last_base) + continue; + last_base = base; + hashr = (hashr>>2) | R[base]; + const uint64_t hashrx = hashr^xor; + + int index = 0; + if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) { + khiter_t k = kh_get(kmer, kmer_h, hashrx); + if (k != kh_end(kmer_h)) + index = UNIQ_TEST(kh_value(kmer_h, k)) ? 
2 : 1; + } + found_r |= index; + switch (index) { + case 2: minhashri = hashrx, minhashpri = i; break; + case 1: minhashrd = hashrx, minhashprd = i; break; + default: + if (minhashr > hashrx) + minhashr = hashrx, minhashpr = i; + } + } + if (minhashri != UINT64_MAX) + minhashr = minhashri, minhashpr = minhashpri; + else if (minhashrd != UINT64_MAX) + minhashr = minhashrd, minhashpr = minhashprd; + + // Pick reverse if better mapping + if ((minhashf > minhashr) || (!found_f && found_r)) { + if (!found_f || found_r) { + minhashf = minhashr; + minhashpf = b->core.l_qseq - minhashpr + kmer - 2; + dir = 1; + } + } } - if (minhashf <= minhashr) { - if (rev) *rev = 0; - if (pos) *pos = minhashpf; - return minhashf; +#ifdef DEBUG_MINHASH + ntot++; + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) { + if (!UNIQ_TEST(kh_value(kmer_h, k))) + ndup++; + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; } else { - if (rev) *rev = 1; - if (pos) *pos = minhashpr; - return minhashr; + nmis++; } +#else + // For indexed kmers, our hash key is the position the kmer + // occurs in the concatenated reference rather than the hash itself. + khiter_t k = kh_get(kmer, kmer_h, minhashf); + if (k != kh_end(kmer_h)) + minhashf = kh_value(kmer_h, k) & UNIQ_MASK; +#endif + + if (rev) *rev = dir; + if (pos) *pos = minhashpf; + + return minhashf != UINT64_MAX ? minhashf : 0; } //--- Start of candidates to punt to htslib @@ -2558,18 +3011,35 @@ static inline void worker_minhash(worker_t *w) { continue; int pos = 0, rev = 0; - uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev); + uint64_t mh = kmer_h + ? (w->no_squash + ? minhash_with_idx(b, w->minimiser_kmer, &pos, &rev, + w->try_rev) + : minhash_with_idx_squash(b, w->minimiser_kmer, &pos, &rev, + w->try_rev) + ) + : minhash(b, w->minimiser_kmer, b->core.l_qseq, + &pos, NULL, &rev, 1, w->try_rev, w->no_squash); if (rev) reverse_complement(b); + if (!kmer_h) { + mh += 1LL<<30; + pos = 65535-pos >= 0 ? 
65535-pos : 0; + } else { + mh -= pos; + pos = 0; + } + + // Store 64-bit hash in unmapped pos and mpos fields. // The position of hash is in isize, which we use for // resolving ties when sorting by hash key. // These are unused for completely unmapped data and // will be reset during final output. - b->core.pos = mh>>31; + b->core.pos = (mh>>31) & 0x7fffffff; b->core.mpos = mh&0x7fffffff; - b->core.isize = 65535-pos >=0 ? 65535-pos : 0; + b->core.isize = pos; } } @@ -2597,7 +3067,8 @@ static void *worker(void *data) static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h, int n_threads, buf_region *in_mem, - int large_pos, int minimiser_kmer) + int large_pos, int minimiser_kmer, bool try_rev, + bool no_squash) { int i; size_t pos, rest; @@ -2621,6 +3092,8 @@ static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h, w[i].h = h; w[i].large_pos = large_pos; w[i].minimiser_kmer = minimiser_kmer; + w[i].try_rev = try_rev; + w[i].no_squash = no_squash; in_mem[i].from = pos; in_mem[i].to = pos + w[i].buf_len; pos += w[i].buf_len; rest -= w[i].buf_len; @@ -2702,6 +3175,7 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) @param sam_order the order in which the sort should occur @param sort_tag the tag to use if sorting by Tag @param minimiser_kmer the kmer size when sorting by MinHash + @param try_rev try reverse strand when sorting by MinHash @param fn name of the file to be sorted @param prefix prefix of the temporary files (prefix.NNNN.bam are written) @param fnout name of the final output file to be written @@ -2719,9 +3193,9 @@ static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) NOT thread safe. 
*/ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, - const char *fn, const char *prefix, - const char *fnout, const char *modeout, - size_t _max_mem, int n_threads, + bool try_rev, bool no_squash, const char *fn, + const char *prefix, const char *fnout, + const char *modeout, size_t _max_mem, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -2960,7 +3434,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, goto err; int sort_res = sort_blocks(k, buf, header, n_threads, - in_mem, large_pos, minimiser_kmer); + in_mem, large_pos, minimiser_kmer, + try_rev, no_squash); if (sort_res < 0) goto err; @@ -2990,7 +3465,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, &fns[consolidate_from], n_threads, in_mem, buf, keys, lib_lookup, &htspool, "sort", NULL, NULL, - NULL, 1, 0) >= 0) { + NULL, 1, 0, 0) >= 0) { merge_res = 0; break; } @@ -3033,7 +3508,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, // Sort last records if (k > 0) { num_in_mem = sort_blocks(k, buf, header, n_threads, - in_mem, large_pos, minimiser_kmer); + in_mem, large_pos, minimiser_kmer, try_rev, + no_squash); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -3062,7 +3538,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header, n_files, fns, num_in_mem, in_mem, buf, keys, lib_lookup, &htspool, "sort", in_fmt, out_fmt, - arg_list, no_pg, write_index) < 0) { + arg_list, no_pg, write_index, 1) < 0) { // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. goto err; @@ -3111,7 +3587,8 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma sprintf(fnout, "%s.bam", prefix); SamOrder sam_order = is_by_qname ? 
QueryName : Coordinate; g_sam_order = sam_order; - ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); + ret = bam_sort_core_ext(sam_order, NULL, 0, false, true, fn, prefix, + fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); free(fnout); return ret; } @@ -3125,7 +3602,11 @@ static void sort_usage(FILE *fp) " -u Output uncompressed data (equivalent to -l 0)\n" " -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n" " -M Use minimiser for clustering unaligned/unplaced reads\n" +" -R Do not use reverse strand (only compatible with -M)\n" " -K INT Kmer size to use for minimiser [20]\n" +" -I FILE Order minimisers by their position in FILE FASTA\n" +" -w INT Window size for minimiser indexing via -I ref.fa [100]\n" +" -H Squash homopolymers when computing minimiser\n" " -n Sort by read name (not compatible with samtools index command)\n" " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" @@ -3161,11 +3642,15 @@ int bam_sort(int argc, char *argv[]) SamOrder sam_order = Coordinate; bool by_tag = false; int minimiser_kmer = 20; + bool try_rev = true; char* sort_tag = NULL, *arg_list = NULL; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; struct stat st; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + int window = 100; + char *minimiser_ref = NULL; + int no_squash = 1; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -3175,7 +3660,7 @@ int bam_sort(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; case 'n': sam_order = QueryName; break; @@ -3194,6 +3679,15 @@ int bam_sort(int argc, char *argv[]) case 1: no_pg 
= 1; break; case 2: sam_order = TemplateCoordinate; break; case 'M': sam_order = MinHash; break; + case 'I': + sam_order = MinHash; // implicit option + minimiser_ref = optarg; + break; + case 'H': no_squash = 0; break; + + case 'w': window = atoi(optarg); break; + + case 'R': try_rev = false; break; case 'K': minimiser_kmer = atoi(optarg); if (minimiser_kmer < 1) @@ -3208,6 +3702,17 @@ int bam_sort(int argc, char *argv[]) } } + if (minimiser_ref) { + fprintf(samtools_stderr, "Building index ... "); + fflush(samtools_stderr); + if (build_minhash_index(minimiser_ref, minimiser_kmer, window, + no_squash)) { + ret = EXIT_FAILURE; + goto sort_end; + } + fprintf(samtools_stderr, "done\n"); + } + // Change sort order if tag sorting is requested. Must update based on secondary index if (by_tag) { sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate; @@ -3264,7 +3769,9 @@ int bam_sort(int argc, char *argv[]) ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); } - ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0, + ret = bam_sort_core_ext(sam_order, sort_tag, + (sam_order == MinHash) ? minimiser_kmer : 0, + try_rev, no_squash, (nargs > 0) ? argv[optind] : "-", tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, &ga.in, &ga.out, arg_list, no_pg, ga.write_index); @@ -3280,6 +3787,12 @@ int bam_sort(int argc, char *argv[]) ret = EXIT_FAILURE; } +#ifdef DEBUG_MINHASH + fprintf(samtools_stderr, "Missed %.1f%%, dup %.1f%%\n", + 100.0*nmis/(ntot+.1), + 100.0*ndup/(ntot+.1)); +#endif + sort_end: free(tmpprefix.s); free(arg_list); diff --git a/samtools/bam_split.c b/samtools/bam_split.c index 72a6298..e9f0fb5 100644 --- a/samtools/bam_split.c +++ b/samtools/bam_split.c @@ -1,6 +1,6 @@ /* bam_split.c -- split subcommand. - Copyright (C) 2013-2016,2018-2019 Genome Research Ltd. + Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd. 
Author: Martin Pollard @@ -292,7 +292,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } } - retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); + retval->merged_input_file = sam_open_format(opts->merged_input_name, "r", &opts->ga.in); if (!retval->merged_input_file) { print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); cleanup_state(retval, false); @@ -341,7 +341,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } } - retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); + char outmode[4] = "w"; + sam_open_mode(outmode + 1, opts->unaccounted_name, NULL); + retval->unaccounted_file = sam_open_format(opts->unaccounted_name, outmode, &opts->ga.out); + if (retval->unaccounted_file == NULL) { print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name); cleanup_state(retval, false); @@ -381,6 +384,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) size_t i; for (i = 0; i < retval->output_count; i++) { char* output_filename = NULL; + char outmode[4] = "w"; output_filename = expand_format_string(opts->output_format_string, input_base_name, @@ -394,7 +398,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } retval->rg_output_file_name[i] = output_filename; - retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out); + + sam_open_mode(outmode + 1, output_filename, NULL); + retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out); + if (retval->rg_output_file[i] == NULL) { print_error_errno("split", "Could not open \"%s\"", output_filename); cleanup_state(retval, false); diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c index 1015234..6c48466 100644 --- a/samtools/bam_split.c.pysam.c +++ b/samtools/bam_split.c.pysam.c @@ -2,7 +2,7 @@ /* bam_split.c -- split subcommand. 
- Copyright (C) 2013-2016,2018-2019 Genome Research Ltd. + Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd. Author: Martin Pollard @@ -294,7 +294,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } } - retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); + retval->merged_input_file = sam_open_format(opts->merged_input_name, "r", &opts->ga.in); if (!retval->merged_input_file) { print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); cleanup_state(retval, false); @@ -343,7 +343,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } } - retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); + char outmode[4] = "w"; + sam_open_mode(outmode + 1, opts->unaccounted_name, NULL); + retval->unaccounted_file = sam_open_format(opts->unaccounted_name, outmode, &opts->ga.out); + if (retval->unaccounted_file == NULL) { print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name); cleanup_state(retval, false); @@ -383,6 +386,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) size_t i; for (i = 0; i < retval->output_count; i++) { char* output_filename = NULL; + char outmode[4] = "w"; output_filename = expand_format_string(opts->output_format_string, input_base_name, @@ -396,7 +400,10 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) } retval->rg_output_file_name[i] = output_filename; - retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out); + + sam_open_mode(outmode + 1, output_filename, NULL); + retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out); + if (retval->rg_output_file[i] == NULL) { print_error_errno("split", "Could not open \"%s\"", output_filename); cleanup_state(retval, false); diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c index 05442bf..c297c5d 100644 --- a/samtools/bamshuf.c +++ 
b/samtools/bamshuf.c @@ -537,7 +537,7 @@ static int usage(FILE *fp, int n_files, int reads_store) { " -l INT Compression level [%d]\n" // DEF_CLEVEL " -n INT Number of temporary files [%d]\n" // n_files " -T PREFIX\n" - " Write tempory files to PREFIX.nnnn.bam\n" + " Write temporary files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n", reads_store, DEF_CLEVEL, n_files); diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c index 6547b3c..e98bc8b 100644 --- a/samtools/bamshuf.c.pysam.c +++ b/samtools/bamshuf.c.pysam.c @@ -539,7 +539,7 @@ static int usage(FILE *fp, int n_files, int reads_store) { " -l INT Compression level [%d]\n" // DEF_CLEVEL " -n INT Number of temporary files [%d]\n" // n_files " -T PREFIX\n" - " Write tempory files to PREFIX.nnnn.bam\n" + " Write temporary files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n", reads_store, DEF_CLEVEL, n_files); diff --git a/samtools/consensus_pileup.c b/samtools/consensus_pileup.c index b48aac2..c9667b3 100644 --- a/samtools/consensus_pileup.c +++ b/samtools/consensus_pileup.c @@ -1,6 +1,6 @@ /* consensus__pileup.h -- Pileup orientated data per consensus column - Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd. + Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd. Author: James Bonfied diff --git a/samtools/consensus_pileup.c.pysam.c b/samtools/consensus_pileup.c.pysam.c index 99fb957..adb6869 100644 --- a/samtools/consensus_pileup.c.pysam.c +++ b/samtools/consensus_pileup.c.pysam.c @@ -2,7 +2,7 @@ /* consensus__pileup.h -- Pileup orientated data per consensus column - Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd. + Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd. 
Author: James Bonfied diff --git a/samtools/consensus_pileup.h b/samtools/consensus_pileup.h index 7aacfaa..cc400aa 100644 --- a/samtools/consensus_pileup.h +++ b/samtools/consensus_pileup.h @@ -1,6 +1,6 @@ /* consensus_pileup.h -- Pileup orientated data per consensus column - Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd. + Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd. Author: James Bonfied diff --git a/samtools/cram_size.c b/samtools/cram_size.c index 6c397bc..54a987c 100644 --- a/samtools/cram_size.c +++ b/samtools/cram_size.c @@ -558,8 +558,8 @@ static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp, fprintf(outfp, "Number of slices %18"PRId64"\n", nslice); fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs); fprintf(outfp, "Number of bases %18"PRId64"\n", nbases); - fprintf(outfp, "Total file size %18"PRId64"\n", end); - fprintf(outfp, "Format overhead size %18"PRId64"\n", end - tot_size); + fprintf(outfp, "Total file size %18"PRId64"\n", (int64_t) end); + fprintf(outfp, "Format overhead size %18"PRId64"\n", (int64_t) (end - tot_size)); return 0; diff --git a/samtools/cram_size.c.pysam.c b/samtools/cram_size.c.pysam.c index f260419..b3031d0 100644 --- a/samtools/cram_size.c.pysam.c +++ b/samtools/cram_size.c.pysam.c @@ -560,8 +560,8 @@ static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp, fprintf(outfp, "Number of slices %18"PRId64"\n", nslice); fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs); fprintf(outfp, "Number of bases %18"PRId64"\n", nbases); - fprintf(outfp, "Total file size %18"PRId64"\n", end); - fprintf(outfp, "Format overhead size %18"PRId64"\n", end - tot_size); + fprintf(outfp, "Total file size %18"PRId64"\n", (int64_t) end); + fprintf(outfp, "Format overhead size %18"PRId64"\n", (int64_t) (end - tot_size)); return 0; diff --git a/samtools/reset.c b/samtools/reset.c index f9b0c09..4e522cd 100644 --- a/samtools/reset.c +++ b/samtools/reset.c @@ -25,6 +25,8 @@ DEALINGS IN 
THE SOFTWARE */ +#include + #include "samtools.h" #include "htslib/sam.h" #include "sam_opts.h" diff --git a/samtools/reset.c.pysam.c b/samtools/reset.c.pysam.c index fdf44b9..c98946f 100644 --- a/samtools/reset.c.pysam.c +++ b/samtools/reset.c.pysam.c @@ -27,6 +27,8 @@ DEALINGS IN THE SOFTWARE */ +#include + #include "samtools.h" #include "htslib/sam.h" #include "sam_opts.h" diff --git a/samtools/sam_view.c b/samtools/sam_view.c index d23e965..aa5b923 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -139,11 +139,17 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) } } -// Returns 0 to indicate read should be output 1 otherwise +// Returns 0 to indicate read should be output 1 otherwise, +// and -1 on error. static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { - if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) - return 1; + if (settings->filter) { + int r = sam_passes_filter(h, b, settings->filter); + if (r < 0) // err + return -1; + if (r == 0) // filter-out + return 1; + } if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { @@ -581,7 +587,9 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) { if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue; if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue; - if ( process_aln(conf->header, rec, conf) ) continue; + int p = process_aln(conf->header, rec, conf); + if (p < 0) goto out; + if (p == 1) continue; nmates++; @@ -632,13 +640,16 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * k = kh_get(names,mate_names,bam_get_qname(rec)); if ( k != kh_end(mate_names) ) drop = 0; } - if (!drop && process_aln(conf->header, rec, conf) == 0) { + int p = 0; + if (!drop && (p=process_aln(conf->header, rec, conf))== 0) { if 
(adjust_tags(conf->header, rec, conf) != 0) goto out; if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out, &write_error) < 0) goto out; } + if (p < 0) + goto out; } if (r < -1) { @@ -669,7 +680,12 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b, if (bam_sanitize(conf->header, b, conf->sanitize) < 0) return -1; - if (!process_aln(conf->header, b, conf)) { + int p; + if ((p = process_aln(conf->header, b, conf)) < 0) { + // error + return -1; + } else if (p == 0) { + // emit read if (!conf->is_count) { change_flag(b, conf); if (adjust_tags(conf->header, b, conf) != 0) @@ -710,17 +726,17 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b, static int stream_view(samview_settings_t *conf) { bam1_t *b = bam_init1(); - int write_error = 0, r; + int write_error = 0, r, p = 0; if (!b) { print_error_errno("view", "could not allocate bam record"); return 1; } errno = 0; // prevent false error messages. while ((r = sam_read1(conf->in, conf->header, b)) >= 0) { - if (process_one_record(conf, b, &write_error) < 0) break; + if ((p = process_one_record(conf, b, &write_error)) < 0) break; } bam_destroy1(b); - if (r < -1) { + if (r < -1 || p < 0) { print_error_errno("view", "error reading file \"%s\"", conf->fn_in); return 1; } diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 7961862..e1b681b 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -141,11 +141,17 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) } } -// Returns 0 to indicate read should be output 1 otherwise +// Returns 0 to indicate read should be output 1 otherwise, +// and -1 on error. 
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { - if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) - return 1; + if (settings->filter) { + int r = sam_passes_filter(h, b, settings->filter); + if (r < 0) // err + return -1; + if (r == 0) // filter-out + return 1; + } if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { @@ -583,7 +589,9 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) { if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue; if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue; - if ( process_aln(conf->header, rec, conf) ) continue; + int p = process_aln(conf->header, rec, conf); + if (p < 0) goto out; + if (p == 1) continue; nmates++; @@ -634,13 +642,16 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * k = kh_get(names,mate_names,bam_get_qname(rec)); if ( k != kh_end(mate_names) ) drop = 0; } - if (!drop && process_aln(conf->header, rec, conf) == 0) { + int p = 0; + if (!drop && (p=process_aln(conf->header, rec, conf))== 0) { if (adjust_tags(conf->header, rec, conf) != 0) goto out; if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out, &write_error) < 0) goto out; } + if (p < 0) + goto out; } if (r < -1) { @@ -671,7 +682,12 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b, if (bam_sanitize(conf->header, b, conf->sanitize) < 0) return -1; - if (!process_aln(conf->header, b, conf)) { + int p; + if ((p = process_aln(conf->header, b, conf)) < 0) { + // error + return -1; + } else if (p == 0) { + // emit read if (!conf->is_count) { change_flag(b, conf); if (adjust_tags(conf->header, b, conf) != 0) @@ -712,17 +728,17 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b, static int stream_view(samview_settings_t 
*conf) { bam1_t *b = bam_init1(); - int write_error = 0, r; + int write_error = 0, r, p = 0; if (!b) { print_error_errno("view", "could not allocate bam record"); return 1; } errno = 0; // prevent false error messages. while ((r = sam_read1(conf->in, conf->header, b)) >= 0) { - if (process_one_record(conf, b, &write_error) < 0) break; + if ((p = process_one_record(conf, b, &write_error)) < 0) break; } bam_destroy1(b); - if (r < -1) { + if (r < -1 || p < 0) { print_error_errno("view", "error reading file \"%s\"", conf->fn_in); return 1; } diff --git a/samtools/stats.c b/samtools/stats.c index 06802b1..44783a9 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -1556,7 +1556,13 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches); fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0); - float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0; + float avg_read_length = (stats->nreads_1st + + stats->nreads_2nd + + stats->nreads_other) + ? (float)stats->total_len / (stats->nreads_1st + + stats->nreads_2nd + + stats->nreads_other) + : 0; fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length); fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0); fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? 
(float)stats->total_len_2nd/stats->nreads_2nd:0); diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index c333195..b3462cc 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -1558,7 +1558,13 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches); fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0); - float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0; + float avg_read_length = (stats->nreads_1st + + stats->nreads_2nd + + stats->nreads_other) + ? (float)stats->total_len / (stats->nreads_1st + + stats->nreads_2nd + + stats->nreads_other) + : 0; fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length); fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0); fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? (float)stats->total_len_2nd/stats->nreads_2nd:0); diff --git a/samtools/version.sh b/samtools/version.sh index 1ac9413..7d17aee 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.17 +VERSION=1.18 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/setup.py b/setup.py index 291d0f9..a4bf36d 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,21 @@ #! /usr/bin/python -'''pysam - a python module for reading, manipulating and writing +'''pysam --- a Python package for reading, manipulating, and writing genomic data sets. 
-pysam is a lightweight wrapper of the htslib C-API and provides -facilities to read and write SAM/BAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ -files as well as access to the command line functionality of the -samtools and bcftools packages. The module supports compression and -random access through indexing. - -This module provides a low-level wrapper around the htslib C-API as -using cython and a high-level API for convenient access to the data -within standard genomic file formats. - -See: -http://www.htslib.org -https://github.com/pysam-developers/pysam -http://pysam.readthedocs.org/en/stable +pysam is a lightweight wrapper of the HTSlib API and provides facilities +to read and write SAM/BAM/CRAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ files +as well as access to the command-line functionality of samtools and bcftools. +The module supports compression and random access through indexing. +This module provides a low-level wrapper around HTSlib's C API using Cython +and a high-level API for convenient access to the data within standard genomic +file formats. 
''' import collections import glob +import logging import os import platform import re @@ -29,13 +23,20 @@ import subprocess import sys import sysconfig from contextlib import contextmanager -from distutils import log from setuptools import setup, Command -from distutils.command.build import build from setuptools.command.sdist import sdist -from distutils.errors import LinkError +from setuptools.extension import Extension + +try: + from setuptools.errors import LinkError +except ImportError: + from distutils.errors import LinkError + +try: + from Cython.Distutils import build_ext +except ImportError: + from setuptools.command.build_ext import build_ext -from cy_build import CyExtension as Extension, cy_build_ext as build_ext try: import cython # noqa HAVE_CYTHON = True @@ -45,6 +46,8 @@ except ImportError: IS_PYTHON3 = sys.version_info.major >= 3 IS_DARWIN = platform.system() == 'Darwin' +log = logging.getLogger('pysam') + @contextmanager def changedir(path): @@ -233,19 +236,34 @@ class cythonize_sdist(sdist): sdist.run(self) -# Override build command to add extra build steps. -class extra_build(build): +# Override Cythonised build_ext command to customise macOS shared libraries. + +class CyExtension(Extension): + def __init__(self, *args, **kwargs): + self._init_func = kwargs.pop("init_func", None) + self._prebuild_func = kwargs.pop("prebuild_func", None) + Extension.__init__(self, *args, **kwargs) + + def extend_includes(self, includes): + self.include_dirs.extend(includes) + + def extend_macros(self, macros): + self.define_macros.extend(macros) + + def extend_extra_objects(self, objs): + self.extra_objects.extend(objs) + + +class cy_build_ext(build_ext): def check_ext_symbol_conflicts(self): """Checks for symbols defined in multiple extension modules, which can lead to crashes due to incorrect functions being invoked. Avoid by adding an appropriate #define to import/pysam.h or in unusual cases adding another rewrite rule to devtools/import.py. 
""" - build_ext_obj = self.distribution.get_command_obj('build_ext') - symbols = dict() for ext in self.distribution.ext_modules: - for sym in run_nm_defined_symbols(build_ext_obj.get_ext_fullpath(ext.name)): + for sym in run_nm_defined_symbols(self.get_ext_fullpath(ext.name)): symbols.setdefault(sym, []).append(ext.name.lstrip('pysam.')) errors = 0 @@ -257,14 +275,55 @@ class extra_build(build): if errors > 0: raise LinkError("symbols defined in multiple extensions") def run(self): - build.run(self) + if sys.platform == 'darwin': + ldshared = os.environ.get('LDSHARED', sysconfig.get_config_var('LDSHARED')) + os.environ['LDSHARED'] = ldshared.replace('-bundle', '') + + build_ext.run(self) try: if HTSLIB_MODE != 'separate': self.check_ext_symbol_conflicts() except OSError as e: - log.warn("skipping symbol collision check (invoking nm failed: %s)", e) + log.warning("skipping symbol collision check (invoking nm failed: %s)", e) except subprocess.CalledProcessError: - log.warn("skipping symbol collision check (invoking nm failed)") + log.warning("skipping symbol collision check (invoking nm failed)") + + def build_extension(self, ext): + + if isinstance(ext, CyExtension) and ext._init_func: + ext._init_func(ext) + + if not self.inplace: + ext.library_dirs.append(os.path.join(self.build_lib, "pysam")) + + if sys.platform == 'darwin': + # The idea is to give shared libraries an install name of the form + # `@rpath/`, and to set the rpath equal to + # @loader_path. This will allow Python packages to find the library + # in the expected place, while still giving enough flexibility to + # external applications to link against the library. 
+ relative_module_path = ext.name.replace(".", os.sep) + (sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO')) + library_path = os.path.join( + "@rpath", os.path.basename(relative_module_path) + ) + + if not ext.extra_link_args: + ext.extra_link_args = [] + ext.extra_link_args += ['-dynamiclib', + '-rpath', '@loader_path', + '-Wl,-headerpad_max_install_names', + '-Wl,-install_name,%s' % library_path, + '-Wl,-x'] + else: + if not ext.extra_link_args: + ext.extra_link_args = [] + + ext.extra_link_args += ['-Wl,-rpath,$ORIGIN'] + + if isinstance(ext, CyExtension) and ext._prebuild_func: + ext._prebuild_func(ext, self.force) + + build_ext.build_extension(self, ext) class clean_ext(Command): @@ -432,7 +491,7 @@ with open(os.path.join("pysam", "config.py"), "w") as outf: for line in inf: if line.startswith("#define"): key, value = re.match( - "#define (\S+)\s+(\S+)", line).groups() + r"#define (\S+)\s+(\S+)", line).groups() config_values[key] = value for key in ["ENABLE_GCS", "ENABLE_PLUGINS", @@ -515,7 +574,7 @@ def prebuild_libchtslib(ext, force): args = " ".join(ext.extra_compile_args) run_make(["ALL_CPPFLAGS=-I. 
" + args + " $(CPPFLAGS)", "lib-static"]) else: - log.warn("skipping 'libhts.a' (already built)") + log.warning("skipping 'libhts.a' (already built)") def prebuild_libcsamtools(ext, force): @@ -609,8 +668,9 @@ Operating System :: MacOS metadata = { 'name': "pysam", 'version': get_pysam_version(), - 'description': "pysam", + 'description': "Package for reading, manipulating, and writing genomic data", 'long_description': __doc__, + 'long_description_content_type': "text/x-rst", 'author': "Andreas Heger", 'author_email': "andreas.heger@gmail.com", 'license': "MIT", @@ -618,9 +678,8 @@ metadata = { 'classifiers': [_f for _f in classifiers.split("\n") if _f], 'url': "https://github.com/pysam-developers/pysam", 'packages': package_list, - 'requires': ['cython (>=0.29.12)'], - 'ext_modules': [Extension(**opts) for opts in modules], - 'cmdclass': {'build': extra_build, 'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist}, + 'ext_modules': [CyExtension(**opts) for opts in modules], + 'cmdclass': {'build_ext': cy_build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist}, 'package_dir': package_dirs, 'package_data': {'': ['*.pxd', '*.h', 'py.typed', '*.pyi'], }, # do not pack in order to permit linking to csamtools.so diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 1dc72d5..855ae47 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -776,6 +776,32 @@ class TestAlignedSegment(ReadTest): ], ) + def test_get_aligned_pairs_1character_md(self): + a = self.build_read() + a.query_sequence = "A" * 7 + a.cigarstring = "7M" + a.set_tag("MD", "7", value_type="A") + self.assertEqual( + a.get_aligned_pairs(with_seq=True), + [ + (0, 20, "A"), + (1, 21, "A"), + (2, 22, "A"), + (3, 23, "A"), + (4, 24, "A"), + (5, 25, "A"), + (6, 26, "A"), + ], + ) + + def test_get_aligned_pairs_bad_type_md(self): + a = self.build_read() + a.query_sequence = "A" * 7 + a.cigarstring = "7M" + a.set_tag("MD", 7) + with 
self.assertRaises(TypeError): + a.get_aligned_pairs(with_seq=True) + def testNoSequence(self): """issue 176: retrieving length without query sequence with soft-clipping. diff --git a/tests/_compile_test.pyx b/tests/_compile_test.pyx index dfe7937..ea2c646 100644 --- a/tests/_compile_test.pyx +++ b/tests/_compile_test.pyx @@ -1,3 +1,5 @@ +# cython: language_level=3 + from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment from pysam.libctabix cimport Tabixfile diff --git a/tests/_cython_flagstat.pyx b/tests/_cython_flagstat.pyx index 8e376b0..39cc15f 100644 --- a/tests/_cython_flagstat.pyx +++ b/tests/_cython_flagstat.pyx @@ -1,3 +1,5 @@ +# cython: language_level=3 + from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED from pysam.libcalignedsegment cimport pysam_get_flag diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 150c5c5..0000000 --- a/tox.ini +++ /dev/null @@ -1,8 +0,0 @@ -# content of: tox.ini , put in same dir as setup.py -[tox] -envlist = py36 py311 - -[testenv] -deps = pytest # install pytest in the virtualenv where commands will be executed -commands = - pytest tests