--- /dev/null
+build_wheels_task:
+ only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*"
+
+ matrix:
+ - compute_engine_instance:
+ image_project: cirrus-images
+ image: family/docker-builder-arm64
+ architecture: arm64
+ platform: linux
+ matrix:
+ - name: Build ARM Linux py3.6-9 wheels
+ env:
+ CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*"
+ - name: Build ARM Linux py3.10-12 wheels
+ env:
+ CIBW_BUILD: "cp310-* cp311-* cp312-*"
+
+ - name: Build ARM macOS wheels
+ macos_instance:
+ image: ghcr.io/cirruslabs/macos-ventura-base:latest
+ env:
+ CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*"
+
+ alias: build_wheels
+
+ env:
+ CIRRUS_CLONE_DEPTH: 1
+
+ CIBW_SKIP: "*-musllinux_*"
+ CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28
+
+ install_script: |
+ python3 -m pip install cibuildwheel==2.16.2
+
+ build_script: |
+ cibuildwheel
+
+ wheels_artifacts:
+ path: wheelhouse/*.whl
+
+upload_pypi_task:
+ only_if: $CIRRUS_BRANCH =~ "release/.*" || $CIRRUS_TAG =~ "v0\..*"
+ depends_on: build_wheels
+
+ name: Publish ARM wheels
+
+ container:
+ image: python:latest
+
+ env:
+ CIRRUS_CLONE_DEPTH: 1
+ API_BASEURL: https://api.cirrus-ci.com/v1
+ TWINE_USERNAME: __token__
+
+ install_script: |
+ python3 -m pip install twine
+
+ get_artifacts_script: |
+ curl -sSLO $API_BASEURL/artifact/build/$CIRRUS_BUILD_ID/wheels.zip
+ unzip -q wheels.zip
+
+ upload_script: |
+ case "$CIRRUS_TAG" in
+ v0.*)
+ export TWINE_REPOSITORY=pypi TWINE_PASSWORD=$PYPI_TOKEN ;;
+ *)
+ export TWINE_REPOSITORY=testpypi TWINE_PASSWORD=$TESTPYPI_TOKEN ;;
+ esac
+
+ echo Uploading wheels to $TWINE_REPOSITORY...
+
+ python3 -m twine check wheelhouse/*.whl
+ python3 -m twine upload --disable-progress-bar wheelhouse/*.whl
+++ /dev/null
-os:
- - linux
- - osx
-
-language: c
-
-stages:
- - test
- - name: deploy
- if: tag IS present
-
-env:
- matrix:
- - CONDA_PY=2.7
- - CONDA_PY=3.6
- - CONDA_PY=3.7
- - CONDA_PY=3.8
- global:
- - PYSAM_LINKING_TEST=1
- - TWINE_USERNAME=grepall
- - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
-
-_cibw_common: &cibw_common
- addons: {}
- install:
- - python3 -m pip install cibuildwheel>=1.1.0 twine
- script:
- - set -e
- - cibuildwheel --output-dir dist
- - twine check dist/*
- - twine upload --skip-existing dist/*
-
-_cibw_linux: &cibw_linux
- stage: deploy
- os: linux
- language: python
- python: '3.5'
- services:
- - docker
- <<: *cibw_common
-
-_cibw_linux_aarch64: &cibw_linux_aarch64
- stage: deploy
- os: linux
- arch: arm64
- language: python
- python: '3.9'
- services:
- - docker
- <<: *cibw_common
-
-matrix:
- include:
- - stage: deploy
- os: linux
- language: python
- python: '3.5'
- addons:
- apt:
- packages:
- - gcc
- - g++
- - libcurl4-openssl-dev # for libcurl support in sdist
- - libssl-dev # for s3 support in sdist
- install:
- - python3 -m pip install Cython twine
- script:
- - set -e
- - python3 setup.py build_ext --inplace
- - python3 setup.py sdist
- - twine check dist/*
- - twine upload --skip-existing dist/*
- - <<: *cibw_linux
- env:
- - CIBW_BUILD="*_x86_64"
- - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- - <<: *cibw_linux
- env:
- - CIBW_BUILD="*_i686"
- - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- - <<: *cibw_linux_aarch64
- env:
- - CIBW_BUILD="*_aarch64"
- - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- - stage: deploy
- os: osx
- language: generic
- env:
- - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- <<: *cibw_common
-
-addons:
- apt:
- packages:
- - gcc
- - g++
-
-script:
- - ./devtools/run_tests_travis.sh
-
-notifications:
- email:
- - andreas.heger@gmail.com
include INSTALL
include KNOWN_BUGS
include THANKS
-include cy_build.py
-include requirements.txt
+include requirements-dev.txt
include pysam/libc*.pxd
include pysam/libc*.pyx
include pysam/libc*.c
include htslib/Makefile htslib/*.mk
exclude htslib/config.mk htslib/htscodecs.mk
-include cy_build.py
-include requirements.txt
-
# documentation
include doc/*.py doc/*.rst
-include doc/Makefile doc/make.bat
+include doc/Makefile doc/make.bat doc/requirements-rtd.txt
-An online version of the installation instructions can be found here:
-http://pysam.readthedocs.io/en/latest/release.html
+.. An online version of the release history can be found here:
+.. http://pysam.readthedocs.io/en/latest/release.html
+
+Release 0.22.0
+==============
+
+.. rubric:: 5 October 2023
+
+This pysam release wraps htslib/samtools/bcftools 1.18 (PR #1208).
+
+It has been tested with Python versions 3.6 through 3.12, and wheels are
+available via pypi_ for all of those Python versions. Python versions 3.6
+and 3.7 are end-of-life; particularly if you use pysam with either of
+these versions, please vote in the version survey at issue #1230.
+
+The final pysam release that supported Python 2.7 was v0.20.0.
+
+Bugs fixed:
+
+* Remove Cython from runtime dependencies (PR #1186, thanks to Nicola Soranzo,
+ also reported by Arya Massarat in PR #1194)
+
+* Miscellaneous dependency improvements (PR #1216, #1217, PR #1218, PR #1219,
+ thanks to Martin Larralde and Arthur Vigil)
+
+* Suppress spurious "Could not retrieve index file" message when opening an
+ AlignmentFile (#939, #1214, reported by ChengYong Tham and Sebastian Röner)
+
+* Propagate SAM parsing errors encountered in :meth:`.AlignedSegment.fromstring`
+ (#1196, reported by DV Klopfenstein)
+
+* Accept invalid MD:A tagged fields produced by HTSeq instead of crashing
+ in :meth:`AlignedSegment.get_aligned_pairs(with_seq=True)
+ <.AlignedSegment.get_aligned_pairs>` (#1226, reported by Isaac Vock)
+
+* Fix multiarch macOS CI builds by removing brewed liblzma (#1205, reported
+ by Till Hartmann)
+
+* Fix :attr:`.VariantRecordSample.alleles` type hint (#1179, reported by
+ David Seifert)
+
+New functionality:
+
+* Add optional :meth:`HTSFile.seek(..., whence) <.HTSFile.seek>` parameter
+ and clarify which functions use libc.SEEK_SET vs io.SEEK_SET
+ (#1185, requested by luyulin)
+
+* File handling improvements in samtools & bcftools commands (should improve
+ #1193 and #1195, reported by Rob Bierman and Sam Chorlton)
+
+* Improve :class:`.FastxFile` performance (PR #1227, thanks to Fabian Klötzl
+ and Valentyn Bezshapkin)
+
+* Improve the accuracy of type hints for :class:`.AlignmentFile` iteration
+ (#1184, PR #1189, reported by @PikalaxALT)
+
+Documentation improvements:
+
+* Clarify that :meth:`.AlignedSegment.get_aligned_pairs` results are 0-based
+ (#1180, reported by Nick Semenkovich)
+
+* Clarify :meth:`.AlignedSegment.get_reference_positions` documentation
+ (#836, #838, reported by Liang Ou and Nick Stoler)
+
+* Clarify that installation via pip usually uses a wheel, and that configuring
+ the build via $HTSLIB_CONFIGURE_OPTIONS etc only applies when installing from
+ an sdist (#1086, reported by Layne Sadler)
+
+A message from pysam's founder, Andreas Heger:
+
+ As many of you will have noticed, John Marshall has been effectively
+ maintaining pysam and supporting users over the last few years.
+ I, Andreas, am very grateful for the countless hours he has contributed.
+ Unfortunately, I will not be able to contribute much in the near and
+ intermediate future. To keep pysam going, John has kindly agreed to
+ continue maintaining and supporting pysam as the principal developer
+ of pysam. I am very happy to know that pysam is in good hands and want
+ to thank again John and the wider pysam community for their suggestions,
+ bug reports, code contributions and general support.
+
+Thank you Andreas for all your work over the years and the solid foundations
+that pysam enjoys and the useful functionality it provides.
-=============
-Release notes
-=============
Release 0.21.0
==============
+.. rubric:: 2 April 2023
+
This release wraps htslib/samtools/bcftools version 1.17.
-Pysam is now compatible with Python 3.11. We have removed python 2.x
-support. Pysam is tested with python versions 3.6 to 3.11.
+Pysam is now compatible with Python 3.11. We have removed Python 2.x
+support. Pysam is tested with Python versions 3.6 to 3.11.
* [#1175] VariantHeader.new_record: set start/stop before alleles
* [#1173] Add multiple build improvements in htscodecs on multi-arch macOS
* [#1149] MacOS universal build compatibility.
* [#1146] Fix build when CFLAGS/etc environment variables are set.
+
Release 0.20.0
==============
+.. rubric:: 29 October 2022
+
This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1.
* [#1113] Full compatibility with setuptools v62.1.0's build directory name changes
Many additional type hints have been provided by the community,
thanks!
+
Release 0.19.1
==============
+.. rubric:: 27 May 2022
+
This release wraps htslib/samtools/bcftools version 1.15.1.
* [#1104] add an add_samples() method to quickly add multiple samples
to VCF.
+
Release 0.19.0
==============
+.. rubric:: 30 March 2022
+
This release wraps htslib/samtools/bcftools version 1.15.
* [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands
* Fix BGZFile.read() behaviour near or at EOF
* First API for the htslib modified bases interface
-
+
+
Release 0.18.0
==============
+.. rubric:: 17 November 2021
+
This release wraps htslib/samtools/bcftools version 1.14.
* [#1048] and [#1060], clarify documentation of index statistics with CRAM files
* Add new "samples" subcommand to pysam/samtools.py
* Introduce TupleProxyIterator iterator object class
+
Release 0.17.0
==============
+.. rubric:: 30 September 2021
+
This release wraps htslib/samtools/bcftools version 1.13. Corresponding
to new samtools commands, `pysam.samtools` now has additional functions
`ampliconclip`, `ampliconstats`, `fqimport`, and `version`.
Release 0.16.0
==============
+.. rubric:: 8 June 2020
+
This release wraps htslib/bcftools version 1.10.2 and samtools version
1.10. The following bugs reported against pysam are fixed due to this:
* [#846] Prevent segmentation fault on ID, when handling malformed records
* [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars
+
Release 0.15.3
==============
Release 0.15.0
==============
-This release wraps htslib (and friends) version 1.9.
+This release wraps htslib/samtools/bcftools version 1.9.
* [#673] permit dash in chromosome name of region string
* [#656] Support `text` when opening a SAM file for writing
* treat border case of all bases in pileup column below quality score
* [#634] Fix access to pileup reference_sequence
+
Release 0.14.0
==============
* [#537] allow tabix index files to be created in a custom location.
* [#530] add get_index_statistics() method
+
Release 0.12.0.1
================
* [#473] A new FastxRecord class that can be instantiated from class and
modified in-place. Replaces PersistentFastqProxy.
* [#521] In AlignmentFile, Simplify file detection logic and allow remote index files
+
* Removed attempts to guess data and index file names; this is magic left
to htslib.
* Removed file existence check prior to opening files with htslib
* Allow remote indices (tested using S3 signed URLs).
* Document filepath_index and make it an alias for index_filename.
* Added a require_index parameter to AlignmentFile
+
* [#526] handle unset ref when creating new records
* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
segfaults
* renamed several methods for pep8 compatibility, old names still retained for
backwards compatibility, but should be considered deprecated.
+
* gettid() is now get_tid()
* getrname() is now get_reference_name()
* parseRegion() is now parse_region()
* some methods have changed for pep8 compatibility without the old
names being present:
+
* fromQualityString() is now qualitystring_to_array()
* toQualityString() is now qualities_to_qualitystring()
* Pysam now wraps htslib and samtools versions 1.1.
* Bugfixes, most notable:
+
* issue #43: uncompressed BAM output
* issue #42: skip tests requiring network if none available
* issue #19: multiple iterators can now be made to work on the same tabix file
compilation options. Especially for OS X this will potentially save a
lot of trouble.
-The current version of pysam wraps 3rd-party code from htslib-1.17, samtools-1.17, and bcftools-1.17.
+The current version of pysam wraps 3rd-party code from htslib-1.18, samtools-1.18, and bcftools-1.18.
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
.. _tabix: http://samtools.sourceforge.net/tabix.shtml
.. _Li 2009: http://www.ncbi.nlm.nih.gov/pubmed/19505943
-.. |build-status| image:: https://travis-ci.org/pysam-developers/pysam.svg
+.. |build-status| image:: https://github.com/pysam-developers/pysam/actions/workflows/ci.yaml/badge.svg
:alt: build status
:scale: 100%
- :target: https://travis-ci.org/pysam-developers/pysam
+ :target: https://github.com/pysam-developers/pysam/actions/workflows/ci.yaml
.. |docs| image:: https://readthedocs.org/projects/pysam/badge/?version=latest
:alt: Documentation Status
-----------------------------------------------------------------------------
-LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey)
+LICENSE FOR VariantKey (https://github.com/tecnickcom/variantkey)
The MIT License
Copyright (c) 2017-2018 GENOMICS plc
+Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
/* bcftools.h -- utility function declarations.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
// newline will be added by the function.
void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
+// For on the fly index creation with --write-index
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
+
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
const char *hts_bcf_wmode2(int file_type, const char *fname);
cs->icig++;
continue;
}
+ if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+ {
+ cs->icig++;
+ continue;
+ }
+ error("FIXME: not ready for CIGAR operator %d\n",op);
}
// the read starts after pos
if ( trim_left )
cs->icig++;
continue;
}
+ if ( op==BAM_CHARD_CLIP || op==BAM_CPAD )
+ {
+ cs->icig++;
+ continue;
+ }
+ error("FIXME: not ready for CIGAR operator %d\n",op);
}
return cs->icig < cs->ncig ? -1 : -2;
}
#define PICK_SHORT 8
#define PICK_IUPAC 16
-#define TO_UPPER 0
-#define TO_LOWER 1
+#define TO_UPPER 1
+#define TO_LOWER 2
typedef struct
{
{
char *ss, *se = line;
while ( *se && !isspace(*se) && *se!=':' ) se++;
- int from = 0, to = 0;
+ hts_pos_t from = 0, to = 0;
char tmp = 0, *tmp_ptr = NULL;
if ( *se )
{
args->fa_frz_mod = -1;
args->fa_case = -1;
args->vcf_rbuf.n = 0;
- bcf_sr_seek(args->files,line,args->fa_ori_pos);
+
+ kstring_t str = {0,0,0};
+ if ( from==0 ) from = 1;
+ if ( to==0 ) to = HTS_POS_MAX;
+ ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
+ bcf_sr_set_regions(args->files,line,0);
+ free(str.s);
+
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
if ( args->chain_fname )
static void mark_ins(char *ref, char *alt, char mark)
{
int i, nref = strlen(ref), nalt = strlen(alt);
- if ( mark=='l' )
+ if ( mark==TO_LOWER )
for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
- else
+ else if ( mark==TO_UPPER )
for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
+ else if ( mark )
+ for (i=nref; i<nalt; i++) alt[i] = mark;
}
static void mark_snv(char *ref, char *alt, char mark)
{
int i, nref = strlen(ref), nalt = strlen(alt);
int n = nref < nalt ? nref : nalt;
- if ( mark=='l' )
+ if ( mark==TO_LOWER )
{
for (i=0; i<n; i++)
if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
}
- else
+ else if ( mark==TO_UPPER)
{
for (i=0; i<n; i++)
if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
}
+ else if ( mark==TO_UPPER)
+ {
+ for (i=0; i<n; i++)
+ if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
+ }
+ else if ( mark )
+ {
+ for (i=0; i<n; i++)
+ if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = mark;
+ }
}
static void iupac_init(args_t *args, bcf1_t *rec)
{
fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
fprintf(stderr, " -H, --haplotype WHICH Choose which allele to use from the FORMAT/GT field, note\n");
fprintf(stderr, " the codes are case-insensitive:\n");
- fprintf(stderr, " 1: first allele from GT, regardless of phasing\n");
- fprintf(stderr, " 2: second allele from GT, regardless of phasing\n");
+ fprintf(stderr, " N: N={1,2,3,..} is the index of the allele from GT, regardless of phasing (e.g. \"2\")\n");
fprintf(stderr, " R: REF allele in het genotypes\n");
fprintf(stderr, " A: ALT allele\n");
fprintf(stderr, " I: IUPAC code for all genotypes\n");
fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
- fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+ fprintf(stderr, " NpIu: index of the allele for phased and IUPAC code for unphased GTs (e.g. \"2pIu\")\n");
fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -I, --iupac-codes Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
- fprintf(stderr, " --mark-del CHAR Instead of removing sequence, insert CHAR for deletions\n");
- fprintf(stderr, " --mark-ins uc|lc Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
- fprintf(stderr, " --mark-snv uc|lc Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(stderr, " --mark-del CHAR Instead of removing sequence, insert character CHAR for deletions\n");
+ fprintf(stderr, " --mark-ins uc|lc|CHAR Highlight insertions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
+ fprintf(stderr, " --mark-snv uc|lc|CHAR Highlight substitutions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
fprintf(stderr, " -m, --mask FILE Replace regions according to the next --mask-with option. The default is --mask-with N\n");
fprintf(stderr, " --mask-with CHAR|uc|lc Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
fprintf(stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n");
{
case 1 : args->mark_del = optarg[0]; break;
case 2 :
- if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
- else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+ if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER;
+ else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0];
else error("The argument is not recognised: --mark-ins %s\n",optarg);
break;
case 3 :
- if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
- else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+ if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER;
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER;
+ else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
else error("The argument is not recognised: --mark-snv %s\n",optarg);
break;
case 'p': args->chr_prefix = optarg; break;
{
char *tmp;
args->haplotype = strtol(optarg, &tmp, 10);
- if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
+ if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg);
+ if ( *tmp ) args->allele |= PICK_IUPAC;
if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
}
break;
#define PICK_SHORT 8
#define PICK_IUPAC 16
-#define TO_UPPER 0
-#define TO_LOWER 1
+#define TO_UPPER 1
+#define TO_LOWER 2
typedef struct
{
{
char *ss, *se = line;
while ( *se && !isspace(*se) && *se!=':' ) se++;
- int from = 0, to = 0;
+ hts_pos_t from = 0, to = 0;
char tmp = 0, *tmp_ptr = NULL;
if ( *se )
{
args->fa_frz_mod = -1;
args->fa_case = -1;
args->vcf_rbuf.n = 0;
- bcf_sr_seek(args->files,line,args->fa_ori_pos);
+
+ kstring_t str = {0,0,0};
+ if ( from==0 ) from = 1;
+ if ( to==0 ) to = HTS_POS_MAX;
+ ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
+ bcf_sr_set_regions(args->files,line,0);
+ free(str.s);
+
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
if ( args->chain_fname )
static void mark_ins(char *ref, char *alt, char mark)
{
int i, nref = strlen(ref), nalt = strlen(alt);
- if ( mark=='l' )
+ if ( mark==TO_LOWER )
for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
- else
+ else if ( mark==TO_UPPER )
for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
+ else if ( mark )
+ for (i=nref; i<nalt; i++) alt[i] = mark;
}
static void mark_snv(char *ref, char *alt, char mark)
{
int i, nref = strlen(ref), nalt = strlen(alt);
int n = nref < nalt ? nref : nalt;
- if ( mark=='l' )
+ if ( mark==TO_LOWER )
{
for (i=0; i<n; i++)
if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);
}
- else
+ else if ( mark==TO_UPPER)
{
for (i=0; i<n; i++)
if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
}
+ else if ( mark==TO_UPPER)
+ {
+ for (i=0; i<n; i++)
+ if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
+ }
+ else if ( mark )
+ {
+ for (i=0; i<n; i++)
+ if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = mark;
+ }
}
static void iupac_init(args_t *args, bcf1_t *rec)
{
fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
fprintf(bcftools_stderr, " -H, --haplotype WHICH Choose which allele to use from the FORMAT/GT field, note\n");
fprintf(bcftools_stderr, " the codes are case-insensitive:\n");
- fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n");
- fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n");
+ fprintf(bcftools_stderr, " N: N={1,2,3,..} is the index of the allele from GT, regardless of phasing (e.g. \"2\")\n");
fprintf(bcftools_stderr, " R: REF allele in het genotypes\n");
fprintf(bcftools_stderr, " A: ALT allele\n");
fprintf(bcftools_stderr, " I: IUPAC code for all genotypes\n");
fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
- fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+ fprintf(bcftools_stderr, " NpIu: index of the allele for phased and IUPAC code for unphased GTs (e.g. \"2pIu\")\n");
fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
fprintf(bcftools_stderr, " -I, --iupac-codes Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
- fprintf(bcftools_stderr, " --mark-del CHAR Instead of removing sequence, insert CHAR for deletions\n");
- fprintf(bcftools_stderr, " --mark-ins uc|lc Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
- fprintf(bcftools_stderr, " --mark-snv uc|lc Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(bcftools_stderr, " --mark-del CHAR Instead of removing sequence, insert character CHAR for deletions\n");
+ fprintf(bcftools_stderr, " --mark-ins uc|lc|CHAR Highlight insertions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
+ fprintf(bcftools_stderr, " --mark-snv uc|lc|CHAR Highlight substitutions in uppercase (uc), lowercase (lc), or use CHAR, leaving the rest as is\n");
fprintf(bcftools_stderr, " -m, --mask FILE Replace regions according to the next --mask-with option. The default is --mask-with N\n");
fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
fprintf(bcftools_stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n");
{
case 1 : args->mark_del = optarg[0]; break;
case 2 :
- if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
- else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+ if ( !strcasecmp(optarg,"uc") ) args->mark_ins = TO_UPPER;
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = TO_LOWER;
+ else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_ins = optarg[0];
else error("The argument is not recognised: --mark-ins %s\n",optarg);
break;
case 3 :
- if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
- else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+ if ( !strcasecmp(optarg,"uc") ) args->mark_snv = TO_UPPER;
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = TO_LOWER;
+ else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
else error("The argument is not recognised: --mark-snv %s\n",optarg);
break;
case 'p': args->chr_prefix = optarg; break;
{
char *tmp;
args->haplotype = strtol(optarg, &tmp, 10);
- if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg);
+ if ( tmp==optarg || (*tmp && strcasecmp(tmp,"pIu")) ) error("Error: Could not parse \"--haplotype %s\", expected number of number followed with \"pIu\"\n", optarg);
+ if ( *tmp ) args->allele |= PICK_IUPAC;
if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n");
}
break;
char **used_tags_list;
int nused_tags;
int allow_undef_tags;
+ int force_newline;
uint8_t **subset_samples;
};
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
vcf_format1(convert->header, line, str);
+ if ( str->s[str->l-1]=='\n' ) str->l--;
}
static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
int convert_header(convert_t *convert, kstring_t *str)
{
int i, icol = 0, l_ori = str->l;
- bcf_hdr_t *hdr = convert->header;
// Supress the header output if LINE is present
for (i=0; i<convert->nfmt; i++)
if ( i!=convert->nfmt )
return str->l - l_ori;
+ // Header formatting becomes problematic when the formatting expression contains a newline.
+ // Simple cases like
+ // -f'[%CHROM %POS %SAMPLE\n]'
+ // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines
+ // are present.
+ int has_fmt_newline = 0;
kputc('#', str);
for (i=0; i<convert->nfmt; i++)
{
while ( convert->fmt[j].is_gt_field ) j++;
for (js=0; js<convert->nsamples; js++)
{
- int ks = convert->samples[js];
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_SEP )
{
- if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+ if ( convert->fmt[k].key )
+ {
+ char *tmp = convert->fmt[k].key;
+ while ( *tmp )
+ {
+ if ( *tmp=='\n' ) has_fmt_newline = 1;
+ else kputc(*tmp,str);
+ tmp++;
+ }
+ }
}
- else if ( convert->fmt[k].type == T_SAMPLE )
- ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
else
- ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
}
+ if ( has_fmt_newline ) break;
}
i = j-1;
continue;
}
ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
}
+ if ( has_fmt_newline ) kputc('\n',str);
return str->l - l_ori;
}
return str->l - l_ori;
}
+static void force_newline_(convert_t *convert)
+{
+ int i, has_newline = 0;
+ for (i=0; i<convert->nfmt; i++)
+ {
+ if ( !convert->fmt[i].key ) continue;
+ char *tmp = convert->fmt[i].key;
+ while (*tmp)
+ {
+ if ( *tmp=='\n' ) { has_newline = 1; break; }
+ tmp++;
+ }
+ if ( has_newline ) break;
+ }
+ if ( has_newline ) return;
+
+ // A newline is not present, force it. But where to add it?
+ // Consider
+ // -f'%CHROM[ %SAMPLE]\n'
+ // vs
+ // -f'[%CHROM %SAMPLE\n]'
+ for (i=0; i<convert->nfmt; i++)
+ if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
+
+ if ( i < convert->nfmt )
+ register_tag(convert, "\n", 0, T_SEP); // the first case
+ else
+ {
+ // the second case
+ i = convert->nfmt - 1;
+ if ( !convert->fmt[i].key )
+ {
+ convert->fmt[i].key = strdup("\n");
+ convert->fmt[i].is_gt_field = 1;
+ register_tag(convert, NULL, 0, T_SEP);
+ }
+ else
+ register_tag(convert, "\n", 1, T_SEP);
+ }
+}
+
int convert_set_option(convert_t *convert, enum convert_option opt, ...)
{
int ret = 0;
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
+ case force_newline:
+ convert->force_newline = va_arg(args, int);
+ if ( convert->force_newline ) force_newline_(convert);
+ break;
default:
ret = -1;
}
char **used_tags_list;
int nused_tags;
int allow_undef_tags;
+ int force_newline;
uint8_t **subset_samples;
};
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
vcf_format1(convert->header, line, str);
+ if ( str->s[str->l-1]=='\n' ) str->l--;
}
static void process_chrom_pos_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
int convert_header(convert_t *convert, kstring_t *str)
{
int i, icol = 0, l_ori = str->l;
- bcf_hdr_t *hdr = convert->header;
// Supress the header output if LINE is present
for (i=0; i<convert->nfmt; i++)
if ( i!=convert->nfmt )
return str->l - l_ori;
+ // Header formatting becomes problematic when the formatting expression contains a newline.
+ // Simple cases like
+ // -f'[%CHROM %POS %SAMPLE\n]'
+ // can be handled quite easily with has_fmt_newline. Note this will not work if multiple newlines
+ // are present.
+ int has_fmt_newline = 0;
kputc('#', str);
for (i=0; i<convert->nfmt; i++)
{
while ( convert->fmt[j].is_gt_field ) j++;
for (js=0; js<convert->nsamples; js++)
{
- int ks = convert->samples[js];
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_SEP )
{
- if ( convert->fmt[k].key ) kputs(convert->fmt[k].key, str);
+ if ( convert->fmt[k].key )
+ {
+ char *tmp = convert->fmt[k].key;
+ while ( *tmp )
+ {
+ if ( *tmp=='\n' ) has_fmt_newline = 1;
+ else kputc(*tmp,str);
+ tmp++;
+ }
+ }
}
- else if ( convert->fmt[k].type == T_SAMPLE )
- ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
else
- ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
+ ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
}
+ if ( has_fmt_newline ) break;
}
i = j-1;
continue;
}
ksprintf(str, "[%d]%s", ++icol, convert->fmt[i].key);
}
+ if ( has_fmt_newline ) kputc('\n',str);
return str->l - l_ori;
}
return str->l - l_ori;
}
+static void force_newline_(convert_t *convert)
+{
+ int i, has_newline = 0;
+ for (i=0; i<convert->nfmt; i++)
+ {
+ if ( !convert->fmt[i].key ) continue;
+ char *tmp = convert->fmt[i].key;
+ while (*tmp)
+ {
+ if ( *tmp=='\n' ) { has_newline = 1; break; }
+ tmp++;
+ }
+ if ( has_newline ) break;
+ }
+ if ( has_newline ) return;
+
+ // A newline is not present, force it. But where to add it?
+ // Consider
+ // -f'%CHROM[ %SAMPLE]\n'
+ // vs
+ // -f'[%CHROM %SAMPLE\n]'
+ for (i=0; i<convert->nfmt; i++)
+ if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
+
+ if ( i < convert->nfmt )
+ register_tag(convert, "\n", 0, T_SEP); // the first case
+ else
+ {
+ // the second case
+ i = convert->nfmt - 1;
+ if ( !convert->fmt[i].key )
+ {
+ convert->fmt[i].key = strdup("\n");
+ convert->fmt[i].is_gt_field = 1;
+ register_tag(convert, NULL, 0, T_SEP);
+ }
+ else
+ register_tag(convert, "\n", 1, T_SEP);
+ }
+}
+
int convert_set_option(convert_t *convert, enum convert_option opt, ...)
{
int ret = 0;
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
+ case force_newline:
+ convert->force_newline = va_arg(args, int);
+ if ( convert->force_newline ) force_newline_(convert);
+ break;
default:
ret = -1;
}
/* convert.h -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2014-2021 Genome Research Ltd.
+ Copyright (C) 2014-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
{
allow_undef_tags,
subset_samples,
+ force_newline,
};
convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
Read about transcript types here
http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
http://www.ensembl.org/info/genome/variation/predicted_data.html
- http://www.gencodegenes.org/gencode_biotypes.html
+ https://www.gencodegenes.org/pages/biotypes.html
List of supported biotypes
antisense
IG_LV_gene
IG_V_gene
lincRNA
+ lncRNA .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
macro_lncRNA
miRNA
misc_RNA
Mt_tRNA
polymorphic_pseudogene
processed_transcript
- protein_coding
+ protein_coding, mRNA
ribozyme
rRNA
sRNA
#include <htslib/khash_str2int.h>
#include <htslib/kseq.h>
#include <htslib/faidx.h>
+#include <htslib/bgzf.h>
#include <errno.h>
#include <unistd.h>
#include <ctype.h>
#include "kheap.h"
#include "smpl_ilist.h"
#include "rbuf.h"
+#include "gff.h"
#ifndef __FUNCTION__
# define __FUNCTION__ __func__
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2
-// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR 2
-#define N_SPLICE_REGION_EXON 3
-#define N_SPLICE_REGION_INTRON 8
-
#define N_REF_PAD 10 // number of bases to avoid boundary effects
-#define STRAND_REV 0
-#define STRAND_FWD 1
-
-#define TRIM_NONE 0
-#define TRIM_5PRIME 1
-#define TRIM_3PRIME 2
-
// How to treat phased/unphased genotypes
#define PHASE_REQUIRE 0 // --phase r
#define PHASE_MERGE 1 // --phase m
#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
// see kput_vcsq()
"start_retained"
};
-
-// GFF line types
-#define GFF_UNKN_LINE 0
-#define GFF_TSCRIPT_LINE 1
-#define GFF_GENE_LINE 2
-
-
-/*
- Genomic features, for fast lookup by position to overlapping features
-*/
-#define GF_coding_bit 6
-#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
-#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
-#define GF_MT_tRNA 2
-#define GF_lincRNA 3
-#define GF_miRNA 4
-#define GF_MISC_RNA 5
-#define GF_rRNA 6
-#define GF_snRNA 7
-#define GF_snoRNA 8
-#define GF_PROCESSED_TRANSCRIPT 9
-#define GF_ANTISENSE 10
-#define GF_macro_lncRNA 11
-#define GF_ribozyme 12
-#define GF_sRNA 13
-#define GF_scRNA 14
-#define GF_scaRNA 15
-#define GF_SENSE_INTRONIC 16
-#define GF_SENSE_OVERLAPPING 17
-#define GF_PSEUDOGENE 18
-#define GF_PROCESSED_PSEUDOGENE 19
-#define GF_ARTIFACT 20
-#define GF_IG_PSEUDOGENE 21
-#define GF_IG_C_PSEUDOGENE 22
-#define GF_IG_J_PSEUDOGENE 23
-#define GF_IG_V_PSEUDOGENE 24
-#define GF_TR_V_PSEUDOGENE 25
-#define GF_TR_J_PSEUDOGENE 26
-#define GF_MT_tRNA_PSEUDOGENE 27
-#define GF_misc_RNA_PSEUDOGENE 28
-#define GF_miRNA_PSEUDOGENE 29
-#define GF_RIBOZYME 30
-#define GF_RETAINED_INTRON 31
-#define GF_RETROTRANSPOSED 32
-#define GF_tRNA_PSEUDOGENE 33
-#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
-#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
-#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
-#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
-#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
-#define GF_KNOWN_NCRNA 39
-#define GF_UNITARY_PSEUDOGENE 40
-#define GF_UNPROCESSED_PSEUDOGENE 41
-#define GF_LRG_GENE 42
-#define GF_3PRIME_OVERLAPPING_ncRNA 43
-#define GF_DISRUPTED_DOMAIN 44
-#define GF_vaultRNA 45
-#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
-#define GF_AMBIGUOUS_ORF 47
-#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
-#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
-#define GF_IG_C (3|(1<<GF_coding_bit))
-#define GF_IG_D (4|(1<<GF_coding_bit))
-#define GF_IG_J (5|(1<<GF_coding_bit))
-#define GF_IG_LV (6|(1<<GF_coding_bit))
-#define GF_IG_V (7|(1<<GF_coding_bit))
-#define GF_TR_C (8|(1<<GF_coding_bit))
-#define GF_TR_D (9|(1<<GF_coding_bit))
-#define GF_TR_J (10|(1<<GF_coding_bit))
-#define GF_TR_V (11|(1<<GF_coding_bit))
-#define GF_NMD (12|(1<<GF_coding_bit))
-#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
-#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
-#define GF_EXON ((1<<(GF_coding_bit+1))+2)
-#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
-#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
-// GF_MAX = (1<<30)-1, see hap_node_t
-
-#define CDS_PHASE_UNKN 3
-typedef struct _tscript_t tscript_t;
-typedef struct
-{
- tscript_t *tr; // transcript
- uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
- uint32_t pos; // 0-based index of the first exon base within the transcript (only to
- // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
- uint32_t len; // exon length
- uint32_t icds:30, // exon index within the transcript
- phase:2; // offset of the CDS: 0,1,2 or 3 for unknown
-}
-gf_cds_t;
-typedef struct
-{
- char *name; // human readable name, e.g. ORF45
- uint32_t iseq;
-}
-gf_gene_t;
-typedef struct
-{
- uint32_t beg,end;
- tscript_t *tr;
-}
-gf_exon_t;
-typedef enum { prime3, prime5 } utr_t;
-typedef struct
-{
- utr_t which;
- uint32_t beg,end;
- tscript_t *tr;
-}
-gf_utr_t;
-
-
/*
Structures related to VCF output:
csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
int ncsq_list, mcsq_list;
};
-struct _tscript_t
+#define TSCRIPT_AUX(x) ((tscript_t*)(x)->aux)
+typedef struct
{
- uint32_t id; // transcript id
- uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
- uint32_t strand:1, // STRAND_REV or STRAND_FWD
- ncds:31, // number of exons
- mcds;
- gf_cds_t **cds; // ordered list of exons
char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
hap_node_t *root; // root of the haplotype tree
hap_node_t **hap; // pointer to haplotype leaves, two for each sample
int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
- uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
- type:30; // one of GF_* types
- gf_gene_t *gene;
-};
-static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+}
+tscript_t;
+static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b)
{
return ( (*a)->end < (*b)->end ) ? 1 : 0;
}
-KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript)
typedef khp_trhp_t tr_heap_t;
typedef struct
{
{
int mstack;
hstack_t *stack;
- tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
kstring_t sseq; // spliced haplotype sequence on ref strand
kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
kstring_t tref; // the variable part of translated reference transcript, coding strand
}
hap_t;
-
-/*
- Helper structures, only for initialization
-
- ftr_t
- temporary list of all exons, CDS, UTRs
-*/
-KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
-typedef struct
-{
- int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
- uint32_t beg;
- uint32_t end;
- uint32_t trid;
- uint32_t strand:1; // STRAND_REV,STRAND_FWD
- uint32_t phase:2; // 0, 1, 2, or 3 for unknown
- uint32_t iseq:29;
-}
-ftr_t;
-/*
- Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
- to integer id. To keep the memory requirements low, the original version
- relied on IDs in the form of a string prefix and a numerical id. However,
- it turns out that this assumption is not valid for some ensembl GFFs, see
- for example Zea_mays.AGPv4.36.gff3.gz
- */
-typedef struct
-{
- void *str2id; // khash_str2int
- int nstr, mstr;
- char **str; // numeric id to string
-}
-id_tbl_t;
-typedef struct
-{
- // all exons, CDS, UTRs
- ftr_t *ftr;
- int nftr, mftr;
-
- // mapping from gene id to gf_gene_t
- kh_int2gene_t *gid2gene;
-
- // mapping from transcript id to tscript, for quick CDS anchoring
- kh_int2tscript_t *id2tr;
-
- // sequences
- void *seq2int; // str2int hash
- char **seq;
- int nseq, mseq;
-
- // ignored biotypes
- void *ignored_biotypes;
-
- id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
-}
-aux_t;
-
typedef struct _args_t
{
// the main regidx lookups, from chr:beg-end to overlapping features and
// index iterator
+ gff_t *gff;
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
regitr_t *itr;
- // temporary structures, deleted after initializtion
- aux_t init;
-
// text tab-delimited output (out) or vcf/bcf output (out_fh)
FILE *out;
htsFile *out_fh;
+ char *index_fn;
+ int write_index;
+ char *dump_gff;
// vcf
bcf_srs_t *sr;
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
+ int unify_chr_names;
+ char *chr_name;
+ int mchr_name;
+ struct {
+ int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ } warned;
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
- tscript_t **rm_tr; // buffer of transcripts to clean
+ gf_tscript_t **rm_tr; // buffer of transcripts to clean
int nrm_tr, mrm_tr;
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
- id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
int n_threads; // extra compression/decompression threads
#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
-static const char *gf_strings_noncoding[] =
-{
- "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
- "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
- "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
- "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
- "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
- "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
- "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
- "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
-};
-static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
-static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
-
-const char *gf_type2gff_string(int type)
-{
- if ( !GF_is_coding(type) )
- {
- if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
- type &= (1<<(GF_coding_bit+1)) - 1;
- return gf_strings_special[type - 1];
- }
- type &= (1<<GF_coding_bit) - 1;
- return gf_strings_coding[type - 1];
-}
-
-/*
- gff parsing functions
-*/
-static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
-{
- aux_t *aux = &args->init;
- char c = chr_end[1];
- chr_end[1] = 0;
- int iseq;
- if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
- {
- // check for possible mismatch in chromosome naming convention such as chrX vs X
- char *new_chr = NULL;
- if ( faidx_has_seq(args->fai,chr_beg) )
- new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx
- else
- {
- int len = strlen(chr_beg);
- if ( !strncmp("chr",chr_beg,3) && len>3 )
- new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not
- else
- {
- new_chr = malloc(len+4); // gff does not have the prefix, faidx has
- memcpy(new_chr,"chr",3);
- memcpy(new_chr+3,chr_beg,len);
- new_chr[len+3] = 0;
- }
- if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai
- {
- static int unkwn_chr_warned = 0;
- if ( !unkwn_chr_warned && args->verbosity>0 )
- fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
- unkwn_chr_warned = 1;
- free(new_chr);
- new_chr = strdup(chr_beg); // use the original sequence name
- }
- }
- if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
- {
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = new_chr;
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
- }
- else
- free(new_chr);
- }
- chr_end[1] = c;
- return iseq;
-}
-static inline char *gff_skip(const char *line, char *ss)
-{
- while ( *ss && *ss!='\t' ) ss++;
- if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- return ss+1;
-}
-static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
-{
- char *se = (char*) line;
- while ( *se && *se!='\t' ) se++;
- if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- *chr_beg = (char*) line;
- *chr_end = se-1;
-}
-static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
-{
- char *se = ss;
- *beg = strtol(ss, &se, 10) - 1;
- if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
- ss = se+1;
- *end = strtol(ss, &se, 10) - 1;
- if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- return se+1;
-}
-static void gff_id_init(id_tbl_t *tbl)
-{
- memset(tbl, 0, sizeof(*tbl));
- tbl->str2id = khash_str2int_init();
-}
-static void gff_id_destroy(id_tbl_t *tbl)
-{
- khash_str2int_destroy_free(tbl->str2id);
- free(tbl->str);
-}
-// returns 0 on success, -1 on failure
-static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
-{
- ss = strstr(ss,needle); // e.g. "ID=transcript:"
- if ( !ss ) return -1;
- ss += strlen(needle);
-
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- char tmp = *se;
- *se = 0;
-
- int id;
- if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
- {
- id = tbl->nstr++;
- hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
- tbl->str[id] = strdup(ss);
- khash_str2int_set(tbl->str2id, tbl->str[id], id);
- }
- *se = tmp;
- *id_ptr = id;
- return 0;
-}
-static inline int gff_parse_type(char *line)
-{
- line = strstr(line,"ID=");
- if ( !line ) return -1;
- line += 3;
- if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
- else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
- return -1;
-}
-static inline int gff_parse_biotype(char *_line)
-{
- char *line = strstr(_line,"biotype=");
- if ( !line ) return -1;
-
- line += 8;
- switch (*line)
- {
- case 'p':
- if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
- else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
- else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
- else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
- break;
- case 'a':
- if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
- else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
- else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
- break;
- case 'I':
- if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
- else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
- else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
- else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
- else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
- else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
- else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
- else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
- else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
- break;
- case 'T':
- if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
- else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
- else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
- else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
- else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
- else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
- break;
- case 'M':
- if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
- else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
- else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
- break;
- case 'l':
- if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
- break;
- case 'm':
- if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
- else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
- else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
- else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
- else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
- break;
- case 'r':
- if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
- else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
- else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
- else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
- break;
- case 's':
- if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
- else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
- else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
- else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
- else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
- else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
- else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
- break;
- case 't':
- if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
- else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
- break;
- case 'n':
- if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
- else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
- break;
- case 'k':
- if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
- break;
- case 'u':
- if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
- else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
- break;
- case 'L':
- if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
- break;
- case '3':
- if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
- break;
- case 'd':
- if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
- break;
- case 'v':
- if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
- break;
- case 'b':
- if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
- break;
- }
- return 0;
-}
-static inline int gff_ignored_biotype(args_t *args, char *ss)
-{
- ss = strstr(ss,"biotype=");
- if ( !ss ) return 0;
-
- ss += 8;
- char *se = ss, tmp;
- while ( *se && *se!=';' ) se++;
- tmp = *se;
- *se = 0;
-
- char *key = ss;
- int n = 0;
- if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
- khash_str2int_set(args->init.ignored_biotypes, key, n+1);
-
- *se = tmp;
- return 1;
-}
-gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
-{
- khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
- gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
- if ( !gene )
- {
- gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
- int ret;
- k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
- kh_val(aux->gid2gene,k) = gene;
- }
- return gene;
-}
-void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- int biotype = gff_parse_biotype(ss);
- if ( biotype <= 0 )
- {
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line);
- return;
- }
-
- // create a mapping from transcript_id to gene_id
- uint32_t trid, gene_id;
- if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) )
- {
- if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) )
- error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
- warned = 1;
- }
- }
- if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) )
- {
- if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) )
- error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
- tr->id = trid;
- tr->strand = ftr->strand;
- tr->gene = gene_init(aux, gene_id);
- tr->type = biotype;
- tr->beg = ftr->beg;
- tr->end = ftr->end;
-
- khint_t k;
- int ret;
- k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
- kh_val(aux->id2tr,k) = tr;
-}
-void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
-{
- int biotype = gff_parse_biotype(ss);
- if ( biotype <= 0 )
- {
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line);
- return;
- }
-
- aux_t *aux = &args->init;
-
- // substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id;
- if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
- {
- if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
- error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- gf_gene_t *gene = gene_init(aux, gene_id);
- assert( !gene->name ); // the gene_id should be unique
-
- gene->iseq = feature_set_seq(args, chr_beg,chr_end);
-
- // substring search for "Name=OR4F5"
- ss = strstr(chr_end+2,"Name=");
- if ( ss )
- {
- ss += 5;
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- gene->name = (char*) malloc(se-ss+1);
- memcpy(gene->name,ss,se-ss);
- gene->name[se-ss] = 0;
- }
- else
- gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
-}
-int gff_parse(args_t *args, char *line, ftr_t *ftr)
-{
- // - skip empty lines and commented lines
- // - columns
- // 1. chr
- // 2. <skip>
- // 3. CDS, transcript, gene, ...
- // 4-5. beg,end
- // 6. <skip>
- // 7. strand
- // 8. phase
- // 9. Parent=transcript:ENST(\d+);ID=... etc
-
- char *ss = line;
- if ( !*ss ) return -1; // skip blank lines
- if ( *ss=='#' ) return -1; // skip comments
-
- char *chr_beg, *chr_end;
- gff_parse_chr(line, &chr_beg, &chr_end);
- ss = gff_skip(line, chr_end + 2);
-
- // 3. column: is this a CDS, transcript, gene, etc.
- if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
- else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
- else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
- else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
- else
- {
- int type = GFF_UNKN_LINE;
- if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE;
- else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE;
- ss = gff_skip(line, ss);
- ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
- ss = gff_skip(line, ss);
- if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene:
- if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
- {
- // we ignore these, debug print to see new types:
- ss = strstr(ss,"ID=");
- if ( !ss ) return -1; // no ID, ignore the line
- if ( !strncmp("chromosome",ss+3,10) ) return -1;
- if ( !strncmp("supercontig",ss+3,11) ) return -1;
- if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line);
- return -1;
- }
-
- // 7. column: strand
- if ( *ss == '+' ) ftr->strand = STRAND_FWD;
- else if ( *ss == '-' ) ftr->strand = STRAND_REV;
- else error("Unknown strand: %c .. %s\n", *ss,ss);
-
- if ( type==GFF_TSCRIPT_LINE )
- gff_parse_transcript(args, line, ss, ftr);
- else
- gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
-
- return -1;
- }
- ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
- ss = gff_skip(line, ss);
-
- // 7. column: strand
- if ( *ss == '+' ) ftr->strand = STRAND_FWD;
- else if ( *ss == '-' ) ftr->strand = STRAND_REV;
- else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
- ss += 2;
-
- // 8. column: phase (codon offset)
- if ( *ss == '0' ) ftr->phase = 0;
- else if ( *ss == '1' ) ftr->phase = 1;
- else if ( *ss == '2' ) ftr->phase = 2;
- else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase
- else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
- ss += 2;
-
- // substring search for "Parent=transcript:ENST00000437963"
- if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
- {
- if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
- error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
- return 0;
-}
-
-static int cmp_cds_ptr(const void *a, const void *b)
-{
- // comparison function for qsort of transcripts's CDS
- if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
- if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
- return 0;
-}
-
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
-{
- *chr_beg = *chr_end = aux->seq[iseq];
- while ( (*chr_end)[1] ) (*chr_end)++;
-}
-tscript_t *tscript_init(aux_t *aux, uint32_t trid)
-{
- khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
- tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
- assert( tr );
- return tr;
-}
-void register_cds(args_t *args, ftr_t *ftr)
-{
- // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
- // ftr is the result of parsing a gff CDS line
- aux_t *aux = &args->init;
-
- tscript_t *tr = tscript_init(aux, ftr->trid);
- if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
- gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
- cds->tr = tr;
- cds->beg = ftr->beg;
- cds->len = ftr->end - ftr->beg + 1;
- cds->icds = 0; // to keep valgrind on mac happy
- cds->phase = ftr->phase;
-
- hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
- tr->cds[tr->ncds++] = cds;
-}
-void register_utr(args_t *args, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
- utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
- utr->beg = ftr->beg;
- utr->end = ftr->end;
- utr->tr = tscript_init(aux, ftr->trid);
-
- char *chr_beg, *chr_end;
- chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
-}
-void register_exon(args_t *args, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
- exon->beg = ftr->beg;
- exon->end = ftr->end;
- exon->tr = tscript_init(aux, ftr->trid);
-
- char *chr_beg, *chr_end;
- chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
-}
-
-void tscript_init_cds(args_t *args)
-{
- aux_t *aux = &args->init;
-
- // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
- khint_t k;
- int warn_phase_unkn = 0;
- for (k=0; k<kh_end(aux->id2tr); k++)
- {
- if ( !kh_exist(aux->id2tr, k) ) continue;
- tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
-
- // position-to-tscript lookup
- char *chr_beg, *chr_end;
- chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
-
- if ( !tr->ncds ) continue; // transcript with no CDS
-
- // sort CDs
- qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
-
- // trim non-coding start
- int i, len = 0;
- if ( tr->strand==STRAND_FWD )
- {
- if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
- {
- if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
- tr->cds[0]->beg += tr->cds[0]->phase;
- tr->cds[0]->len -= tr->cds[0]->phase;
- tr->cds[0]->phase = 0;
- }
-
- // sanity check phase; the phase number in gff tells us how many bases to skip in this
- // feature to reach the first base of the next codon
- int tscript_ok = 1;
- for (i=0; i<tr->ncds; i++)
- {
- if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
- {
- warn_phase_unkn = 1;
- len += tr->cds[i]->len;
- continue;
- }
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3 )
- {
- if ( args->force )
- {
- if ( args->verbosity > 0 )
- fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- tscript_ok = 0;
- break;
- }
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- }
- len += tr->cds[i]->len;
- }
- if ( !tscript_ok ) continue; // skip this transcript
- }
- else
- {
- if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
- {
- // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
- // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
- // todo: the same for the fwd strand
- i = tr->ncds - 1;
- int phase = tr->cds[i]->phase;
- if ( phase ) tr->trim |= TRIM_5PRIME;
- while ( i>=0 && phase > tr->cds[i]->len )
- {
- phase -= tr->cds[i]->len;
- tr->cds[i]->phase = 0;
- tr->cds[i]->len = 0;
- i--;
- }
- tr->cds[i]->len -= tr->cds[i]->phase;
- tr->cds[i]->phase = 0;
- }
-
- // sanity check phase
- int tscript_ok = 1;
- for (i=tr->ncds-1; i>=0; i--)
- {
- if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
- {
- warn_phase_unkn = 1;
- len += tr->cds[i]->len;
- continue;
- }
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3)
- {
- if ( args->force )
- {
- if ( args->verbosity > 0 )
- fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- tscript_ok = 0;
- break;
- }
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- }
- len += tr->cds[i]->len;
- }
- if ( !tscript_ok ) continue; // skip this transcript
- }
-
- // set len. At the same check that CDS within a transcript do not overlap
- len = 0;
- for (i=0; i<tr->ncds; i++)
- {
- tr->cds[i]->icds = i;
- len += tr->cds[i]->len;
- if ( !i ) continue;
-
- gf_cds_t *a = tr->cds[i-1];
- gf_cds_t *b = tr->cds[i];
- if ( a->beg + a->len - 1 >= b->beg )
- {
- if ( args->force )
- {
- fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
- args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
- }
- else
- error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
- " Use the --force option to override (at your own risk).\n",
- args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
- }
- }
- if ( len%3 != 0 )
- {
- // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
- // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
- // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
-
- tr->trim |= TRIM_3PRIME;
- if ( tr->strand==STRAND_FWD )
- {
- i = tr->ncds - 1;
- while ( i>=0 && len%3 )
- {
- int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
- tr->cds[i]->len -= dlen;
- len -= dlen;
- i--;
- }
- }
- else
- {
- i = 0;
- while ( i<tr->ncds && len%3 )
- {
- int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
- tr->cds[i]->len -= dlen;
- tr->cds[i]->beg += dlen;
- len -= dlen;
- i++;
- }
- }
- }
-
- // set CDS offsets and insert into regidx
- len=0;
- for (i=0; i<tr->ncds; i++)
- {
- tr->cds[i]->pos = len;
- len += tr->cds[i]->len;
- regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
- }
- }
- if ( warn_phase_unkn && args->verbosity > 0 )
- fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
-}
-
-void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
-void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
-
-void init_gff(args_t *args)
-{
- aux_t *aux = &args->init;
- aux->seq2int = khash_str2int_init(); // chrom's numeric id
- aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
- aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
- args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
- aux->ignored_biotypes = khash_str2int_init();
- gff_id_init(&aux->gene_ids);
- gff_id_init(&args->tscript_ids);
-
- // parse gff
- kstring_t str = {0,0,0};
- htsFile *fp = hts_open(args->gff_fname,"r");
- if ( !fp ) error("Failed to read %s\n", args->gff_fname);
- while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
- {
- hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
- int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
- if ( !ret ) aux->nftr++;
- }
- free(str.s);
- if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
-
-
- // process gff information: connect CDS and exons to transcripts
- args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
- args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
- args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
- args->itr = regitr_init(NULL);
-
- int i;
- for (i=0; i<aux->nftr; i++)
- {
- ftr_t *ftr = &aux->ftr[i];
-
- // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
- khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
- if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
-
- tscript_t *tr = kh_val(aux->id2tr,k);
- if ( !tr->gene->name )
- {
- // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
- regidx_free_tscript(&tr);
- kh_del(int2tscript, aux->id2tr,k);
- continue;
- }
-
- // populate regidx by category:
- // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
- // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
- if ( ftr->type==GF_CDS ) register_cds(args, ftr);
- else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
- else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
- else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
- else
- error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
- }
- tscript_init_cds(args);
-
- if ( args->verbosity > 0 )
- {
- fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
- regidx_nregs(args->idx_tscript),
- regidx_nregs(args->idx_exon),
- regidx_nregs(args->idx_cds),
- regidx_nregs(args->idx_utr));
- }
- if ( !regidx_nregs(args->idx_tscript) )
- fprintf(stderr,
- "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
- " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
- " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
-
- free(aux->ftr);
- khash_str2int_destroy_free(aux->seq2int);
- // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
- kh_destroy(int2tscript,aux->id2tr);
- free(aux->seq);
- gff_id_destroy(&aux->gene_ids);
-
- if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
- {
- khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
- fprintf(stderr,"Ignored the following biotypes:\n");
- for (i = kh_begin(ign); i < kh_end(ign); i++)
- {
- if ( !kh_exist(ign,i)) continue;
- const char *biotype = kh_key(ign,i);
- if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
- fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
- }
- }
- khash_str2int_destroy_free(aux->ignored_biotypes);
-}
-
static inline int ncsq2_to_nfmt(int ncsq2)
{
return 1 + (ncsq2 - 1) / 30;
args->fai = fai_load(args->fa_fname);
if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
- if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
- init_gff(args);
+ args->gff = gff_init(args->gff_fname);
+ gff_set(args->gff,verbosity,args->verbosity);
+ gff_set(args->gff,strip_chr_names,args->unify_chr_names);
+ gff_set(args->gff,force_out_of_phase,args->force);
+ gff_set(args->gff,dump_fname,args->dump_gff);
+ gff_parse(args->gff);
+ args->idx_cds = gff_get(args->gff,idx_cds);
+ args->idx_utr = gff_get(args->gff,idx_utr);
+ args->idx_exon = gff_get(args->gff,idx_exon);
+ args->idx_tscript = gff_get(args->gff,idx_tscript);
+ args->itr = regitr_init(NULL);
args->rid = -1;
if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
}
"Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
" the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
- regidx_destroy(args->idx_cds);
- regidx_destroy(args->idx_utr);
- regidx_destroy(args->idx_exon);
- regidx_destroy(args->idx_tscript);
regitr_destroy(args->itr);
-
- khint_t k,i,j;
- for (k=0; k<kh_end(args->init.gid2gene); k++)
- {
- if ( !kh_exist(args->init.gid2gene, k) ) continue;
- gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
- free(gene->name);
- free(gene);
- }
- kh_destroy(int2gene,args->init.gid2gene);
+ gff_destroy(args->gff);
if ( args->filter )
filter_destroy(args->filter);
khp_destroy(trhp,args->active_tr);
kh_destroy(pos2vbuf,args->pos2vbuf);
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
- int ret;
+ int i,j,ret;
if ( args->out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
ret = hts_close(args->out_fh);
+ }
else
ret = fclose(args->out);
if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- gff_id_destroy(&args->tscript_ids);
+ free(args->chr_name);
}
/*
#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
typedef struct
{
- tscript_t *tr;
+ gf_tscript_t *tr;
struct {
int32_t pos, rlen, alen, ial;
char *ref, *alt;
if ( rbeg < splice->vcf.pos )
{
assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
- kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
roff = 0;
}
else
if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
if ( splice->kref.l < rlen )
- kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
}
#if XDBG
fprintf(stderr,"r3: %s\n",splice->kref.s);
if ( abeg < splice->vcf.pos )
{
assert( splice->tr->beg <= abeg );
- kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
aoff = 0;
}
else
if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
if ( alen > 0 && alen > splice->kalt.l )
- kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
}
#if XDBG
fprintf(stderr,"a3: %s\n",splice->kalt.s);
while ( regitr_overlap(itr) )
{
gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
- tscript_t *tr = utr->tr;
+ gf_tscript_t *tr = utr->tr;
if ( tr->id != trid ) continue;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
}
return 0;
}
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
{
#if XDBG
fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
+static inline const char *drop_chr_prefix(args_t *args, const char *chr) // Returns chr with a leading "chr" skipped when args->unify_chr_names is set, otherwise chr itself; nothing is allocated
+{
+ if ( !args->unify_chr_names ) return chr; // name unification disabled: pass the name through untouched
+ if ( !strncasecmp("chr",chr,3) ) return chr+3; // case-insensitive match on the first three characters; the result points into the caller's string
+ return chr; // no "chr" prefix present
+}
+static inline const char *add_chr_prefix(args_t *args, const char *chr) // Returns "chr"+chr built in the reusable buffer args->chr_name when args->unify_chr_names is set, otherwise chr itself
+{
+ if ( !args->unify_chr_names ) return chr; // name unification disabled: pass the name through untouched
+ int len = strlen(chr);
+ hts_expand(char,len+4,args->mchr_name,args->chr_name); // room for "chr" (3) + name (len) + terminating NUL (1)
+ memcpy(args->chr_name,"chr",3);
+ memcpy(args->chr_name+3,chr,len+1); // len+1 copies the terminating NUL as well
+ return args->chr_name; // shared buffer: overwritten by the next call; the prefix is prepended even if chr already starts with "chr"
+}
static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
// coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
static int small_ref_padding_warned = 0;
- tscript_t *tr = splice->tr;
+ gf_tscript_t *tr = splice->tr;
// We know the VCF record overlaps the exon, but does it overlap the start codon?
if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
}
char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
- char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
+ char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
#if XDBG
fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif
}
char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
- char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
+ char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
#if XDBG
fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
{
int i;
kstring_t str = {0,0,0};
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
child->icds = cds->icds; // index of cds in the tscript's list of exons
child->vcf_ial = ial;
}
if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
{
- if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
- else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
}
if ( child->icds!=0 ) splice.check_region_beg = 1;
if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
// the variant is on a new exon, finish up the previous
int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
if ( len > 0 )
- kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
}
// append any skipped non-variant exons
while ( ++i < cds->icds )
- kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
if ( parent->icds==child->icds )
{
free(splice.kalt.s);
return 1;
}
- kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
}
else
- kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
}
kputs(splice.kalt.s + dbeg, &str);
#endif
}
-void tscript_splice_ref(tscript_t *tr)
+void tscript_splice_ref(gf_tscript_t *tr) // Builds TSCRIPT_AUX(tr)->sref: the transcript's CDS segments concatenated, with N_REF_PAD bytes copied in front and behind, NUL-terminated
{
int i, len = 0;
for (i=0; i<tr->ncds; i++)
len += tr->cds[i]->len;
- tr->nsref = len + 2*N_REF_PAD;
- tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD; // summed CDS length plus both pads
+ TSCRIPT_AUX(tr)->sref = (char*) malloc(len + 1 + 2*N_REF_PAD); // +1 for the terminating NUL written on the last line
len = 0;
- memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD); // leading pad: lacking the +N_REF_PAD offset used for the CDS copies, the source starts N_REF_PAD bytes before the first CDS in ref
len += N_REF_PAD;
for (i=0; i<tr->ncds; i++)
{
- memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len); // append CDS i; ref is indexed relative to tr->beg, shifted by N_REF_PAD
len += tr->cds[i]->len;
}
- memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD); // trailing pad; NOTE(review): the source offset is the start of the last CDS, not the position after its end - confirm this is intended
len += N_REF_PAD;
- tr->sref[len] = 0;
+ TSCRIPT_AUX(tr)->sref[len] = 0; // terminate; len now equals nsref
}
// returns: 0 if consequence was added, 1 if it already exists or could not be added
if ( csq->type & CSQ_UPSTREAM_STOP )
kputc_('*',str);
- int i, n = sizeof(csq_strings)/sizeof(char*);
+ int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*);
for (i=1; i<n; i++)
- if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+ if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputs(csq_strings[i],str); break; }
i++;
for (; i<n; i++)
- if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+ if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputc_('&',str); kputs(csq_strings[i],str); }
+
+ if ( (csq->biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) )
+ {
+ if ( has_csq ) kputc_('&',str); // just in case, this should always be true
+ kputs("NMD_transcript",str);
+ }
kputc_('|', str);
if ( csq->gene ) kputs(csq->gene , str);
kputc_('|', str);
- if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+// if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);
kputc_('|', str);
kputs(gf_type2gff_string(csq->biotype), str);
void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
{
int i;
- tscript_t *tr = hap->tr;
+ gf_tscript_t *tr = hap->tr;
int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
int icsq = node->ncsq_list++;
hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
str.l = 0;
// create the aa variant string
- int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
void hap_finalize(args_t *args, hap_t *hap)
{
- tscript_t *tr = hap->tr;
- if ( !tr->sref )
+ gf_tscript_t *tr = hap->tr;
+ if ( !TSCRIPT_AUX(tr)->sref )
tscript_splice_ref(tr);
kstring_t sref;
- sref.s = tr->sref;
- sref.l = tr->nsref;
+ sref.s = TSCRIPT_AUX(tr)->sref;
+ sref.l = TSCRIPT_AUX(tr)->nsref;
sref.m = sref.l;
int istack = 0;
hap->sseq.l = 0;
hap->tseq.l = 0;
- hap->stack[0].node = tr->root;
+ hap->stack[0].node = TSCRIPT_AUX(tr)->root;
hap->stack[0].ichild = -1;
hap->stack[0].slen = 0;
hap->stack[0].dlen = 0;
kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
-static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
if ( !node || !node->ncsq_list ) return;
}
}
-static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
if ( !node || !node->ncsq_list || ismpl<0 ) return;
tr_heap_t *heap = args->active_tr;
while ( heap->ndat && heap->dat[0]->end<=pos )
{
- tscript_t *tr = heap->dat[0];
+ gf_tscript_t *tr = heap->dat[0];
khp_delete(trhp, heap);
args->hap->tr = tr;
- if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling
{
hap_finalize(args, args->hap);
if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
{
if ( args->phase==PHASE_DROP_GT )
- hap_print_text(args, tr, -1,0, tr->hap[0]);
+ hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
else
{
for (i=0; i<args->smpl->n; i++)
{
for (j=0; j<2; j++)
- hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]);
}
}
}
for (i=0; i<args->smpl->n; i++)
{
for (j=0; j<2; j++)
- hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]);
}
}
}
// mark the transcript for deletion. Cannot delete it immediately because
// by-position VCF output will need them when flushed by vcf_buf_push
args->nrm_tr++;
- hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
args->rm_tr[args->nrm_tr-1] = tr;
}
}
for (i=0; i<args->nrm_tr; i++)
{
- tscript_t *tr = args->rm_tr[i];
- if ( tr->root ) hap_destroy(tr->root);
- tr->root = NULL;
- free(tr->hap);
- free(tr->ref);
- free(tr->sref);
+ gf_tscript_t *tr = args->rm_tr[i];
+ tscript_t *aux = TSCRIPT_AUX(tr);
+ if ( aux->root ) hap_destroy(aux->root);
+ aux->root = NULL;
+ free(aux->hap);
+ free(aux->ref);
+ free(aux->sref);
+ free(aux);
+ tr->aux = NULL;
}
args->nrm_tr = 0;
args->ncsq_buf = 0;
}
-void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
{
int i, len;
int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
- tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
- if ( !tr->ref )
+ const char *tmp_chr = chr;
+ if ( !faidx_has_seq(args->fai,tmp_chr) )
+ {
+ tmp_chr = drop_chr_prefix(args,chr);
+ if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
+ }
+ TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !TSCRIPT_AUX(tr)->ref )
error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
{
char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
- memcpy(ref+i, tr->ref, len);
+ memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len);
len += i;
for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
ref[i+len] = 0;
- free(tr->ref);
- tr->ref = ref;
+ free(TSCRIPT_AUX(tr)->ref);
+ TSCRIPT_AUX(tr)->ref = ref;
}
}
-static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
{
int vbeg = 0;
int rbeg = rec->pos - tr->beg + N_REF_PAD;
if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
- char *ref = tr->ref + rbeg;
+ char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
char *vcf = rec->d.allele[0] + vbeg;
- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );
int i = 0;
while ( ref[i] && vcf[i] )
{
int test_cds_local(args_t *args, bcf1_t *rec)
{
int i,j, ret = 0;
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
if ( !GF_is_coding(tr->type) ) continue;
ret = 1;
- if ( !tr->ref )
+ if ( !TSCRIPT_AUX(tr) )
{
+ tr->aux = calloc(sizeof(tscript_t),1);
tscript_init_ref(args, tr, chr);
tscript_splice_ref(tr);
khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
sanity_check_ref(args, tr, rec);
kstring_t sref;
- sref.s = tr->sref;
- sref.l = tr->nsref;
+ sref.s = TSCRIPT_AUX(tr)->sref;
+ sref.l = TSCRIPT_AUX(tr)->nsref;
sref.m = sref.l;
for (i=1; i<rec->n_allele; i++)
{
// create the aa variant string
kstring_t str = {0,0,0};
- int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
- int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
kprint_aa_prediction(args,aa_rbeg,tref,&str);
csq_stage(args, &csq, rec);
// all this only to clean vstr when vrec is flushed
- if ( !tr->root )
- tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
- tr->root->ncsq_list++;
- hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
- csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ if ( !TSCRIPT_AUX(tr)->root )
+ TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ TSCRIPT_AUX(tr)->root->ncsq_list++;
+ hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
+ csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
rm_csq->type.vstr = str;
}
if ( csq_type & ~CSQ_COMPOUND )
static int overlaps_warned = 0, multiploid_warned = 0;
int i, ret = 0, hap_ret;
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
if ( !GF_is_coding(tr->type) ) continue;
if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
ret = 1;
- if ( !tr->root )
+ if ( !TSCRIPT_AUX(tr) )
{
// initialize the transcript and its haplotype tree, fetch the reference sequence
+ tr->aux = calloc(sizeof(tscript_t),1);
tscript_init_ref(args, tr, chr);
- tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
- tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
- tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
- for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
- tr->root->nend = tr->nhap;
- tr->root->type = HAP_ROOT;
+ TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ TSCRIPT_AUX(tr)->hap = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
+ for (i=0; i<TSCRIPT_AUX(tr)->nhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
+ TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;
+ TSCRIPT_AUX(tr)->root->type = HAP_ROOT;
khp_insert(trhp, args->active_tr, &tr);
}
if ( args->phase==PHASE_DROP_GT )
{
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
- hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
hap_ret = hap_init(args, parent, child, cds, rec, 1);
if ( hap_ret!=0 )
parent->mchild = 1;
parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
parent->child[0] = child;
- tr->hap[0] = child;
- tr->hap[0]->nend = 1;
+ TSCRIPT_AUX(tr)->hap[0] = child;
+ TSCRIPT_AUX(tr)->hap[0]->nend = 1;
continue;
}
assert( ial < rec->n_allele );
if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
- hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
{
// this haplotype has been seen in another sample
- tr->hap[i] = parent->child[ parent->cur_child[ial] ];
- tr->hap[i]->nend++;
+ TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
+ TSCRIPT_AUX(tr)->hap[i]->nend++;
parent->nend--;
continue;
}
hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
parent->cur_child[ial] = j;
parent->child[j] = child;
- tr->hap[i] = child;
- tr->hap[i]->nend++;
+ TSCRIPT_AUX(tr)->hap[i] = child;
+ TSCRIPT_AUX(tr)->hap[i]->nend++;
parent->nend--;
}
}
}
int test_utr(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
- tscript_t *tr = splice.tr = utr->tr;
+ gf_tscript_t *tr = splice.tr = utr->tr;
for (i=1; i<rec->n_allele; i++)
{
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
}
int test_splice(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
splice_t splice;
}
int test_tscript(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
int i, ret = 0;
while ( regitr_overlap(args->itr) )
{
- tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
for (i=1; i<rec->n_allele; i++)
{
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
warned = 1;
}
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// only insertions atm
int beg = rec->pos + 1;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
- tscript_t *tr = utr->tr;
+ gf_tscript_t *tr = utr->tr;
csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
csq.pos = rec->pos;
csq.type.biotype = tr->type;
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
- tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
splice.vcf.alt = rec->d.allele[1];
splice.csq = csq_class;
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the
// reference file
if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
- error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ {
+ if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
+ error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ }
}
if ( prev_pos > rec->pos )
error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased hets\n"
- "Options:\n"
- " -e, --exclude EXPR Exclude sites for which the expression is true\n"
+ "GFF options:\n"
+ " --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n"
" --force Run even if some sanity checks fail\n"
+ " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+ "General options:\n"
+ " -e, --exclude EXPR Exclude sites for which the expression is true\n"
" -i, --include EXPR Select sites for which the expression is true\n"
" --no-version Do not append version and command line to the header\n"
" -o, --output FILE Write output to a file [standard output]\n"
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with <int> worker threads [0]\n"
" -v, --verbose INT Verbosity level 0-2 [1]\n"
+ " --write-index Automatically index the output files [off]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
args->verbosity = 1;
args->record_cmd_line = 1;
args->clevel = -1;
+ args->unify_chr_names = 1;
static struct option loptions[] =
{
{"targets-file",1,0,'T'},
{"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
+ {"write-index",no_argument,NULL,6},
+ {"dump-gff",required_argument,NULL,7},
+ {"unify-chr-names",required_argument,NULL,8},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
case 3 : args->record_cmd_line = 0; break;
case 'b':
args->brief_predictions = 1;
- fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+ fprintf(stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n");
break;
case 'B':
args->brief_predictions = strtol(optarg,&tmp,10);
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
+ case 6 : args->write_index = 1; break;
+ case 7 : args->dump_gff = optarg; break;
+ case 8 :
+ if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
+ else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
+ else error("Could not parse: --unify-chr-names %s\n",optarg);
+ break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
Read about transcript types here
http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
http://www.ensembl.org/info/genome/variation/predicted_data.html
- http://www.gencodegenes.org/gencode_biotypes.html
+ https://www.gencodegenes.org/pages/biotypes.html
List of supported biotypes
antisense
IG_LV_gene
IG_V_gene
lincRNA
+ lncRNA .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
macro_lncRNA
miRNA
misc_RNA
Mt_tRNA
polymorphic_pseudogene
processed_transcript
- protein_coding
+ protein_coding, mRNA
ribozyme
rRNA
sRNA
#include <htslib/khash_str2int.h>
#include <htslib/kseq.h>
#include <htslib/faidx.h>
+#include <htslib/bgzf.h>
#include <errno.h>
#include <unistd.h>
#include <ctype.h>
#include "kheap.h"
#include "smpl_ilist.h"
#include "rbuf.h"
+#include "gff.h"
#ifndef __FUNCTION__
# define __FUNCTION__ __func__
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2
-// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR 2
-#define N_SPLICE_REGION_EXON 3
-#define N_SPLICE_REGION_INTRON 8
-
#define N_REF_PAD 10 // number of bases to avoid boundary effects
-#define STRAND_REV 0
-#define STRAND_FWD 1
-
-#define TRIM_NONE 0
-#define TRIM_5PRIME 1
-#define TRIM_3PRIME 2
-
// How to treat phased/unphased genotypes
#define PHASE_REQUIRE 0 // --phase r
#define PHASE_MERGE 1 // --phase m
#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
// see kput_vcsq()
"start_retained"
};
-
-// GFF line types
-#define GFF_UNKN_LINE 0
-#define GFF_TSCRIPT_LINE 1
-#define GFF_GENE_LINE 2
-
-
-/*
- Genomic features, for fast lookup by position to overlapping features
-*/
-#define GF_coding_bit 6
-#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
-#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
-#define GF_MT_tRNA 2
-#define GF_lincRNA 3
-#define GF_miRNA 4
-#define GF_MISC_RNA 5
-#define GF_rRNA 6
-#define GF_snRNA 7
-#define GF_snoRNA 8
-#define GF_PROCESSED_TRANSCRIPT 9
-#define GF_ANTISENSE 10
-#define GF_macro_lncRNA 11
-#define GF_ribozyme 12
-#define GF_sRNA 13
-#define GF_scRNA 14
-#define GF_scaRNA 15
-#define GF_SENSE_INTRONIC 16
-#define GF_SENSE_OVERLAPPING 17
-#define GF_PSEUDOGENE 18
-#define GF_PROCESSED_PSEUDOGENE 19
-#define GF_ARTIFACT 20
-#define GF_IG_PSEUDOGENE 21
-#define GF_IG_C_PSEUDOGENE 22
-#define GF_IG_J_PSEUDOGENE 23
-#define GF_IG_V_PSEUDOGENE 24
-#define GF_TR_V_PSEUDOGENE 25
-#define GF_TR_J_PSEUDOGENE 26
-#define GF_MT_tRNA_PSEUDOGENE 27
-#define GF_misc_RNA_PSEUDOGENE 28
-#define GF_miRNA_PSEUDOGENE 29
-#define GF_RIBOZYME 30
-#define GF_RETAINED_INTRON 31
-#define GF_RETROTRANSPOSED 32
-#define GF_tRNA_PSEUDOGENE 33
-#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
-#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
-#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
-#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
-#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
-#define GF_KNOWN_NCRNA 39
-#define GF_UNITARY_PSEUDOGENE 40
-#define GF_UNPROCESSED_PSEUDOGENE 41
-#define GF_LRG_GENE 42
-#define GF_3PRIME_OVERLAPPING_ncRNA 43
-#define GF_DISRUPTED_DOMAIN 44
-#define GF_vaultRNA 45
-#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
-#define GF_AMBIGUOUS_ORF 47
-#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
-#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
-#define GF_IG_C (3|(1<<GF_coding_bit))
-#define GF_IG_D (4|(1<<GF_coding_bit))
-#define GF_IG_J (5|(1<<GF_coding_bit))
-#define GF_IG_LV (6|(1<<GF_coding_bit))
-#define GF_IG_V (7|(1<<GF_coding_bit))
-#define GF_TR_C (8|(1<<GF_coding_bit))
-#define GF_TR_D (9|(1<<GF_coding_bit))
-#define GF_TR_J (10|(1<<GF_coding_bit))
-#define GF_TR_V (11|(1<<GF_coding_bit))
-#define GF_NMD (12|(1<<GF_coding_bit))
-#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
-#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
-#define GF_EXON ((1<<(GF_coding_bit+1))+2)
-#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
-#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
-// GF_MAX = (1<<30)-1, see hap_node_t
-
-#define CDS_PHASE_UNKN 3
-typedef struct _tscript_t tscript_t;
-typedef struct
-{
- tscript_t *tr; // transcript
- uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
- uint32_t pos; // 0-based index of the first exon base within the transcript (only to
- // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
- uint32_t len; // exon length
- uint32_t icds:30, // exon index within the transcript
- phase:2; // offset of the CDS: 0,1,2 or 3 for unknown
-}
-gf_cds_t;
-typedef struct
-{
- char *name; // human readable name, e.g. ORF45
- uint32_t iseq;
-}
-gf_gene_t;
-typedef struct
-{
- uint32_t beg,end;
- tscript_t *tr;
-}
-gf_exon_t;
-typedef enum { prime3, prime5 } utr_t;
-typedef struct
-{
- utr_t which;
- uint32_t beg,end;
- tscript_t *tr;
-}
-gf_utr_t;
-
-
/*
Structures related to VCF output:
csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
int ncsq_list, mcsq_list;
};
-struct _tscript_t
+#define TSCRIPT_AUX(x) ((tscript_t*)(x)->aux)
+typedef struct
{
- uint32_t id; // transcript id
- uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
- uint32_t strand:1, // STRAND_REV or STRAND_FWD
- ncds:31, // number of exons
- mcds;
- gf_cds_t **cds; // ordered list of exons
char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
hap_node_t *root; // root of the haplotype tree
hap_node_t **hap; // pointer to haplotype leaves, two for each sample
int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
- uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
- type:30; // one of GF_* types
- gf_gene_t *gene;
-};
-static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+}
+tscript_t;
+static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b)
{
return ( (*a)->end < (*b)->end ) ? 1 : 0;
}
-KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript)
typedef khp_trhp_t tr_heap_t;
typedef struct
{
{
int mstack;
hstack_t *stack;
- tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ gf_tscript_t *tr; // tr->ref: spliced transcript on ref strand
kstring_t sseq; // spliced haplotype sequence on ref strand
kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
kstring_t tref; // the variable part of translated reference transcript, coding strand
}
hap_t;
-
-/*
- Helper structures, only for initialization
-
- ftr_t
- temporary list of all exons, CDS, UTRs
-*/
-KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
-typedef struct
-{
- int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
- uint32_t beg;
- uint32_t end;
- uint32_t trid;
- uint32_t strand:1; // STRAND_REV,STRAND_FWD
- uint32_t phase:2; // 0, 1, 2, or 3 for unknown
- uint32_t iseq:29;
-}
-ftr_t;
-/*
- Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
- to integer id. To keep the memory requirements low, the original version
- relied on IDs in the form of a string prefix and a numerical id. However,
- it turns out that this assumption is not valid for some ensembl GFFs, see
- for example Zea_mays.AGPv4.36.gff3.gz
- */
-typedef struct
-{
- void *str2id; // khash_str2int
- int nstr, mstr;
- char **str; // numeric id to string
-}
-id_tbl_t;
-typedef struct
-{
- // all exons, CDS, UTRs
- ftr_t *ftr;
- int nftr, mftr;
-
- // mapping from gene id to gf_gene_t
- kh_int2gene_t *gid2gene;
-
- // mapping from transcript id to tscript, for quick CDS anchoring
- kh_int2tscript_t *id2tr;
-
- // sequences
- void *seq2int; // str2int hash
- char **seq;
- int nseq, mseq;
-
- // ignored biotypes
- void *ignored_biotypes;
-
- id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
-}
-aux_t;
-
typedef struct _args_t
{
// the main regidx lookups, from chr:beg-end to overlapping features and
// index iterator
+ gff_t *gff;
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
regitr_t *itr;
- // temporary structures, deleted after initializtion
- aux_t init;
-
// text tab-delimited output (out) or vcf/bcf output (out_fh)
FILE *out;
htsFile *out_fh;
+ char *index_fn;
+ int write_index;
+ char *dump_gff;
// vcf
bcf_srs_t *sr;
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
+ int unify_chr_names;
+ char *chr_name;
+ int mchr_name;
+ struct {
+ int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ } warned;
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
- tscript_t **rm_tr; // buffer of transcripts to clean
+ gf_tscript_t **rm_tr; // buffer of transcripts to clean
int nrm_tr, mrm_tr;
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
- id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
int n_threads; // extra compression/decompression threads
#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
-static const char *gf_strings_noncoding[] =
-{
- "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
- "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
- "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
- "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
- "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
- "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
- "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
- "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
-};
-static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
-static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
-
-const char *gf_type2gff_string(int type)
-{
- if ( !GF_is_coding(type) )
- {
- if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
- type &= (1<<(GF_coding_bit+1)) - 1;
- return gf_strings_special[type - 1];
- }
- type &= (1<<GF_coding_bit) - 1;
- return gf_strings_coding[type - 1];
-}
-
-/*
- gff parsing functions
-*/
-static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
-{
- aux_t *aux = &args->init;
- char c = chr_end[1];
- chr_end[1] = 0;
- int iseq;
- if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
- {
- // check for possible mismatch in chromosome naming convention such as chrX vs X
- char *new_chr = NULL;
- if ( faidx_has_seq(args->fai,chr_beg) )
- new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx
- else
- {
- int len = strlen(chr_beg);
- if ( !strncmp("chr",chr_beg,3) && len>3 )
- new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not
- else
- {
- new_chr = malloc(len+4); // gff does not have the prefix, faidx has
- memcpy(new_chr,"chr",3);
- memcpy(new_chr+3,chr_beg,len);
- new_chr[len+3] = 0;
- }
- if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai
- {
- static int unkwn_chr_warned = 0;
- if ( !unkwn_chr_warned && args->verbosity>0 )
- fprintf(bcftools_stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
- unkwn_chr_warned = 1;
- free(new_chr);
- new_chr = strdup(chr_beg); // use the original sequence name
- }
- }
- if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
- {
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = new_chr;
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
- }
- else
- free(new_chr);
- }
- chr_end[1] = c;
- return iseq;
-}
-static inline char *gff_skip(const char *line, char *ss)
-{
- while ( *ss && *ss!='\t' ) ss++;
- if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- return ss+1;
-}
-static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
-{
- char *se = (char*) line;
- while ( *se && *se!='\t' ) se++;
- if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- *chr_beg = (char*) line;
- *chr_end = se-1;
-}
-static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
-{
- char *se = ss;
- *beg = strtol(ss, &se, 10) - 1;
- if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
- ss = se+1;
- *end = strtol(ss, &se, 10) - 1;
- if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- return se+1;
-}
-static void gff_id_init(id_tbl_t *tbl)
-{
- memset(tbl, 0, sizeof(*tbl));
- tbl->str2id = khash_str2int_init();
-}
-static void gff_id_destroy(id_tbl_t *tbl)
-{
- khash_str2int_destroy_free(tbl->str2id);
- free(tbl->str);
-}
-// returns 0 on success, -1 on failure
-static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
-{
- ss = strstr(ss,needle); // e.g. "ID=transcript:"
- if ( !ss ) return -1;
- ss += strlen(needle);
-
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- char tmp = *se;
- *se = 0;
-
- int id;
- if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
- {
- id = tbl->nstr++;
- hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
- tbl->str[id] = strdup(ss);
- khash_str2int_set(tbl->str2id, tbl->str[id], id);
- }
- *se = tmp;
- *id_ptr = id;
- return 0;
-}
-static inline int gff_parse_type(char *line)
-{
- line = strstr(line,"ID=");
- if ( !line ) return -1;
- line += 3;
- if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
- else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
- return -1;
-}
-static inline int gff_parse_biotype(char *_line)
-{
- char *line = strstr(_line,"biotype=");
- if ( !line ) return -1;
-
- line += 8;
- switch (*line)
- {
- case 'p':
- if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
- else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
- else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
- else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
- break;
- case 'a':
- if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
- else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
- else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
- break;
- case 'I':
- if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
- else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
- else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
- else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
- else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
- else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
- else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
- else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
- else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
- break;
- case 'T':
- if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
- else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
- else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
- else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
- else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
- else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
- break;
- case 'M':
- if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
- else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
- else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
- break;
- case 'l':
- if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
- break;
- case 'm':
- if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
- else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
- else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
- else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
- else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
- break;
- case 'r':
- if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
- else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
- else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
- else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
- break;
- case 's':
- if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
- else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
- else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
- else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
- else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
- else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
- else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
- break;
- case 't':
- if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
- else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
- break;
- case 'n':
- if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
- else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
- break;
- case 'k':
- if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
- break;
- case 'u':
- if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
- else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
- break;
- case 'L':
- if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
- break;
- case '3':
- if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
- break;
- case 'd':
- if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
- break;
- case 'v':
- if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
- break;
- case 'b':
- if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
- break;
- }
- return 0;
-}
-static inline int gff_ignored_biotype(args_t *args, char *ss)
-{
- ss = strstr(ss,"biotype=");
- if ( !ss ) return 0;
-
- ss += 8;
- char *se = ss, tmp;
- while ( *se && *se!=';' ) se++;
- tmp = *se;
- *se = 0;
-
- char *key = ss;
- int n = 0;
- if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
- khash_str2int_set(args->init.ignored_biotypes, key, n+1);
-
- *se = tmp;
- return 1;
-}
-gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
-{
- khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
- gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
- if ( !gene )
- {
- gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
- int ret;
- k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
- kh_val(aux->gid2gene,k) = gene;
- }
- return gene;
-}
-void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- int biotype = gff_parse_biotype(ss);
- if ( biotype <= 0 )
- {
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript, unknown biotype: %s\n",line);
- return;
- }
-
- // create a mapping from transcript_id to gene_id
- uint32_t trid, gene_id;
- if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) )
- {
- if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) )
- error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(bcftools_stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
- warned = 1;
- }
- }
- if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) )
- {
- if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) )
- error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(bcftools_stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
- tr->id = trid;
- tr->strand = ftr->strand;
- tr->gene = gene_init(aux, gene_id);
- tr->type = biotype;
- tr->beg = ftr->beg;
- tr->end = ftr->end;
-
- khint_t k;
- int ret;
- k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
- kh_val(aux->id2tr,k) = tr;
-}
-void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
-{
- int biotype = gff_parse_biotype(ss);
- if ( biotype <= 0 )
- {
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene, unknown biotype: %s\n",line);
- return;
- }
-
- aux_t *aux = &args->init;
-
- // substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id;
- if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
- {
- if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
- error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(bcftools_stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- gf_gene_t *gene = gene_init(aux, gene_id);
- assert( !gene->name ); // the gene_id should be unique
-
- gene->iseq = feature_set_seq(args, chr_beg,chr_end);
-
- // substring search for "Name=OR4F5"
- ss = strstr(chr_end+2,"Name=");
- if ( ss )
- {
- ss += 5;
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- gene->name = (char*) malloc(se-ss+1);
- memcpy(gene->name,ss,se-ss);
- gene->name[se-ss] = 0;
- }
- else
- gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
-}
-int gff_parse(args_t *args, char *line, ftr_t *ftr)
-{
- // - skip empty lines and commented lines
- // - columns
- // 1. chr
- // 2. <skip>
- // 3. CDS, transcript, gene, ...
- // 4-5. beg,end
- // 6. <skip>
- // 7. strand
- // 8. phase
- // 9. Parent=transcript:ENST(\d+);ID=... etc
-
- char *ss = line;
- if ( !*ss ) return -1; // skip blank lines
- if ( *ss=='#' ) return -1; // skip comments
-
- char *chr_beg, *chr_end;
- gff_parse_chr(line, &chr_beg, &chr_end);
- ss = gff_skip(line, chr_end + 2);
-
- // 3. column: is this a CDS, transcript, gene, etc.
- if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
- else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
- else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
- else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
- else
- {
- int type = GFF_UNKN_LINE;
- if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE;
- else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE;
- ss = gff_skip(line, ss);
- ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
- ss = gff_skip(line, ss);
- if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene:
- if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
- {
- // we ignore these, debug print to see new types:
- ss = strstr(ss,"ID=");
- if ( !ss ) return -1; // no ID, ignore the line
- if ( !strncmp("chromosome",ss+3,10) ) return -1;
- if ( !strncmp("supercontig",ss+3,11) ) return -1;
- if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored: %s\n", line);
- return -1;
- }
-
- // 7. column: strand
- if ( *ss == '+' ) ftr->strand = STRAND_FWD;
- else if ( *ss == '-' ) ftr->strand = STRAND_REV;
- else error("Unknown strand: %c .. %s\n", *ss,ss);
-
- if ( type==GFF_TSCRIPT_LINE )
- gff_parse_transcript(args, line, ss, ftr);
- else
- gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
-
- return -1;
- }
- ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
- ss = gff_skip(line, ss);
-
- // 7. column: strand
- if ( *ss == '+' ) ftr->strand = STRAND_FWD;
- else if ( *ss == '-' ) ftr->strand = STRAND_REV;
- else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
- ss += 2;
-
- // 8. column: phase (codon offset)
- if ( *ss == '0' ) ftr->phase = 0;
- else if ( *ss == '1' ) ftr->phase = 1;
- else if ( *ss == '2' ) ftr->phase = 2;
- else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase
- else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
- ss += 2;
-
- // substring search for "Parent=transcript:ENST00000437963"
- if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
- {
- if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
- error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- static int warned = 0;
- if ( !warned && args->verbosity > 0 )
- {
- fprintf(bcftools_stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
- warned = 1;
- }
- }
-
- ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
- return 0;
-}
-
-static int cmp_cds_ptr(const void *a, const void *b)
-{
- // comparison function for qsort of transcripts's CDS
- if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
- if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
- return 0;
-}
-
-static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
-{
- *chr_beg = *chr_end = aux->seq[iseq];
- while ( (*chr_end)[1] ) (*chr_end)++;
-}
-tscript_t *tscript_init(aux_t *aux, uint32_t trid)
-{
- khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
- tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
- assert( tr );
- return tr;
-}
-void register_cds(args_t *args, ftr_t *ftr)
-{
- // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
- // ftr is the result of parsing a gff CDS line
- aux_t *aux = &args->init;
-
- tscript_t *tr = tscript_init(aux, ftr->trid);
- if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
- gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
- cds->tr = tr;
- cds->beg = ftr->beg;
- cds->len = ftr->end - ftr->beg + 1;
- cds->icds = 0; // to keep valgrind on mac happy
- cds->phase = ftr->phase;
-
- hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
- tr->cds[tr->ncds++] = cds;
-}
-void register_utr(args_t *args, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
- utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
- utr->beg = ftr->beg;
- utr->end = ftr->end;
- utr->tr = tscript_init(aux, ftr->trid);
-
- char *chr_beg, *chr_end;
- chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
-}
-void register_exon(args_t *args, ftr_t *ftr)
-{
- aux_t *aux = &args->init;
- gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
- exon->beg = ftr->beg;
- exon->end = ftr->end;
- exon->tr = tscript_init(aux, ftr->trid);
-
- char *chr_beg, *chr_end;
- chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
-}
-
-void tscript_init_cds(args_t *args)
-{
- aux_t *aux = &args->init;
-
- // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
- khint_t k;
- int warn_phase_unkn = 0;
- for (k=0; k<kh_end(aux->id2tr); k++)
- {
- if ( !kh_exist(aux->id2tr, k) ) continue;
- tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
-
- // position-to-tscript lookup
- char *chr_beg, *chr_end;
- chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
- regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
-
- if ( !tr->ncds ) continue; // transcript with no CDS
-
- // sort CDs
- qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
-
- // trim non-coding start
- int i, len = 0;
- if ( tr->strand==STRAND_FWD )
- {
- if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
- {
- if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
- tr->cds[0]->beg += tr->cds[0]->phase;
- tr->cds[0]->len -= tr->cds[0]->phase;
- tr->cds[0]->phase = 0;
- }
-
- // sanity check phase; the phase number in gff tells us how many bases to skip in this
- // feature to reach the first base of the next codon
- int tscript_ok = 1;
- for (i=0; i<tr->ncds; i++)
- {
- if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
- {
- warn_phase_unkn = 1;
- len += tr->cds[i]->len;
- continue;
- }
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3 )
- {
- if ( args->force )
- {
- if ( args->verbosity > 0 )
- fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- tscript_ok = 0;
- break;
- }
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- }
- len += tr->cds[i]->len;
- }
- if ( !tscript_ok ) continue; // skip this transcript
- }
- else
- {
- if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
- {
- // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
- // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
- // todo: the same for the fwd strand
- i = tr->ncds - 1;
- int phase = tr->cds[i]->phase;
- if ( phase ) tr->trim |= TRIM_5PRIME;
- while ( i>=0 && phase > tr->cds[i]->len )
- {
- phase -= tr->cds[i]->len;
- tr->cds[i]->phase = 0;
- tr->cds[i]->len = 0;
- i--;
- }
- tr->cds[i]->len -= tr->cds[i]->phase;
- tr->cds[i]->phase = 0;
- }
-
- // sanity check phase
- int tscript_ok = 1;
- for (i=tr->ncds-1; i>=0; i--)
- {
- if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
- {
- warn_phase_unkn = 1;
- len += tr->cds[i]->len;
- continue;
- }
- int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3)
- {
- if ( args->force )
- {
- if ( args->verbosity > 0 )
- fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- tscript_ok = 0;
- break;
- }
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
- }
- len += tr->cds[i]->len;
- }
- if ( !tscript_ok ) continue; // skip this transcript
- }
-
- // set len. At the same check that CDS within a transcript do not overlap
- len = 0;
- for (i=0; i<tr->ncds; i++)
- {
- tr->cds[i]->icds = i;
- len += tr->cds[i]->len;
- if ( !i ) continue;
-
- gf_cds_t *a = tr->cds[i-1];
- gf_cds_t *b = tr->cds[i];
- if ( a->beg + a->len - 1 >= b->beg )
- {
- if ( args->force )
- {
- fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
- args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
- }
- else
- error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
- " Use the --force option to override (at your own risk).\n",
- args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
- }
- }
- if ( len%3 != 0 )
- {
- // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
- // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
- // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
-
- tr->trim |= TRIM_3PRIME;
- if ( tr->strand==STRAND_FWD )
- {
- i = tr->ncds - 1;
- while ( i>=0 && len%3 )
- {
- int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
- tr->cds[i]->len -= dlen;
- len -= dlen;
- i--;
- }
- }
- else
- {
- i = 0;
- while ( i<tr->ncds && len%3 )
- {
- int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
- tr->cds[i]->len -= dlen;
- tr->cds[i]->beg += dlen;
- len -= dlen;
- i++;
- }
- }
- }
-
- // set CDS offsets and insert into regidx
- len=0;
- for (i=0; i<tr->ncds; i++)
- {
- tr->cds[i]->pos = len;
- len += tr->cds[i]->len;
- regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
- }
- }
- if ( warn_phase_unkn && args->verbosity > 0 )
- fprintf(bcftools_stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
-}
-
-void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
-void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
-
-void init_gff(args_t *args)
-{
- aux_t *aux = &args->init;
- aux->seq2int = khash_str2int_init(); // chrom's numeric id
- aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
- aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
- args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
- aux->ignored_biotypes = khash_str2int_init();
- gff_id_init(&aux->gene_ids);
- gff_id_init(&args->tscript_ids);
-
- // parse gff
- kstring_t str = {0,0,0};
- htsFile *fp = hts_open(args->gff_fname,"r");
- if ( !fp ) error("Failed to read %s\n", args->gff_fname);
- while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
- {
- hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
- int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
- if ( !ret ) aux->nftr++;
- }
- free(str.s);
- if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
-
-
- // process gff information: connect CDS and exons to transcripts
- args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
- args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
- args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
- args->itr = regitr_init(NULL);
-
- int i;
- for (i=0; i<aux->nftr; i++)
- {
- ftr_t *ftr = &aux->ftr[i];
-
- // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
- khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
- if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
-
- tscript_t *tr = kh_val(aux->id2tr,k);
- if ( !tr->gene->name )
- {
- // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
- regidx_free_tscript(&tr);
- kh_del(int2tscript, aux->id2tr,k);
- continue;
- }
-
- // populate regidx by category:
- // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
- // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
- if ( ftr->type==GF_CDS ) register_cds(args, ftr);
- else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
- else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
- else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
- else
- error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
- }
- tscript_init_cds(args);
-
- if ( args->verbosity > 0 )
- {
- fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
- regidx_nregs(args->idx_tscript),
- regidx_nregs(args->idx_exon),
- regidx_nregs(args->idx_cds),
- regidx_nregs(args->idx_utr));
- }
- if ( !regidx_nregs(args->idx_tscript) )
- fprintf(bcftools_stderr,
- "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
- " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
- " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
-
- free(aux->ftr);
- khash_str2int_destroy_free(aux->seq2int);
- // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
- kh_destroy(int2tscript,aux->id2tr);
- free(aux->seq);
- gff_id_destroy(&aux->gene_ids);
-
- if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
- {
- khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
- fprintf(bcftools_stderr,"Ignored the following biotypes:\n");
- for (i = kh_begin(ign); i < kh_end(ign); i++)
- {
- if ( !kh_exist(ign,i)) continue;
- const char *biotype = kh_key(ign,i);
- if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
- fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
- }
- }
- khash_str2int_destroy_free(aux->ignored_biotypes);
-}
-
static inline int ncsq2_to_nfmt(int ncsq2)
{
return 1 + (ncsq2 - 1) / 30;
args->fai = fai_load(args->fa_fname);
if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
- if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname);
- init_gff(args);
+ args->gff = gff_init(args->gff_fname);
+ gff_set(args->gff,verbosity,args->verbosity);
+ gff_set(args->gff,strip_chr_names,args->unify_chr_names);
+ gff_set(args->gff,force_out_of_phase,args->force);
+ gff_set(args->gff,dump_fname,args->dump_gff);
+ gff_parse(args->gff);
+ args->idx_cds = gff_get(args->gff,idx_cds);
+ args->idx_utr = gff_get(args->gff,idx_utr);
+ args->idx_exon = gff_get(args->gff,idx_exon);
+ args->idx_tscript = gff_get(args->gff,idx_tscript);
+ args->itr = regitr_init(NULL);
args->rid = -1;
if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n");
}
"Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
" the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
- regidx_destroy(args->idx_cds);
- regidx_destroy(args->idx_utr);
- regidx_destroy(args->idx_exon);
- regidx_destroy(args->idx_tscript);
regitr_destroy(args->itr);
-
- khint_t k,i,j;
- for (k=0; k<kh_end(args->init.gid2gene); k++)
- {
- if ( !kh_exist(args->init.gid2gene, k) ) continue;
- gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
- free(gene->name);
- free(gene);
- }
- kh_destroy(int2gene,args->init.gid2gene);
+ gff_destroy(args->gff);
if ( args->filter )
filter_destroy(args->filter);
khp_destroy(trhp,args->active_tr);
kh_destroy(pos2vbuf,args->pos2vbuf);
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
- int ret;
+ int i,j,ret;
if ( args->out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
ret = hts_close(args->out_fh);
+ }
else
ret = fclose(args->out);
if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- gff_id_destroy(&args->tscript_ids);
+ free(args->chr_name);
}
/*
#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
typedef struct
{
- tscript_t *tr;
+ gf_tscript_t *tr;
struct {
int32_t pos, rlen, alen, ial;
char *ref, *alt;
if ( rbeg < splice->vcf.pos )
{
assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
- kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
roff = 0;
}
else
if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
if ( splice->kref.l < rlen )
- kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
}
#if XDBG
fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s);
if ( abeg < splice->vcf.pos )
{
assert( splice->tr->beg <= abeg );
- kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
aoff = 0;
}
else
if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
if ( alen > 0 && alen > splice->kalt.l )
- kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ kputsn(TSCRIPT_AUX(splice->tr)->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
}
#if XDBG
fprintf(bcftools_stderr,"a3: %s\n",splice->kalt.s);
while ( regitr_overlap(itr) )
{
gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
- tscript_t *tr = utr->tr;
+ gf_tscript_t *tr = utr->tr;
if ( tr->id != trid ) continue;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
}
return 0;
}
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
{
#if XDBG
fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
+// Return the chromosome name to use for index lookups. When args->unify_chr_names
+// is set and the name starts with "chr" (compared case-insensitively), the returned
+// pointer skips those three characters; otherwise the input pointer is returned
+// unchanged. No memory is allocated, the result always points into `chr`.
+static inline const char *drop_chr_prefix(args_t *args, const char *chr)
+{
+    // the prefix test is evaluated only when name unification was requested
+    int strip_prefix = args->unify_chr_names && strncasecmp("chr",chr,3)==0;
+    return strip_prefix ? chr+3 : chr;
+}
+// Return the chromosome name with "chr" prepended. When args->unify_chr_names is
+// not set, the input pointer is returned unchanged. Otherwise the prefixed name is
+// written into the reusable buffer args->chr_name and that buffer is returned, so
+// the result is overwritten by the next call and must not be freed by the caller.
+static inline const char *add_chr_prefix(args_t *args, const char *chr)
+{
+    if ( args->unify_chr_names )
+    {
+        int nchr = strlen(chr);
+
+        // room for "chr" + name + terminating zero
+        hts_expand(char,nchr+4,args->mchr_name,args->chr_name);
+
+        memcpy(args->chr_name,"chr",3);
+        memcpy(args->chr_name+3,chr,nchr+1);    // +1 copies the terminating zero as well
+        return args->chr_name;
+    }
+    return chr;
+}
static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
// coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
static int small_ref_padding_warned = 0;
- tscript_t *tr = splice->tr;
+ gf_tscript_t *tr = splice->tr;
// We know the VCF record overlaps the exon, but does it overlap the start codon?
if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
}
char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
- char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
+ char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted
#if XDBG
fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif
}
char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele
- char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
+ char *ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block
#if XDBG
fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
if ( splice->check_utr )
{
regitr_t *itr = regitr_init(NULL);
- const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
{
int i;
kstring_t str = {0,0,0};
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
child->icds = cds->icds; // index of cds in the tscript's list of exons
child->vcf_ial = ial;
}
if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
{
- if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
- else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
}
if ( child->icds!=0 ) splice.check_region_beg = 1;
if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
// the variant is on a new exon, finish up the previous
int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
if ( len > 0 )
- kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
}
// append any skipped non-variant exons
while ( ++i < cds->icds )
- kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
if ( parent->icds==child->icds )
{
free(splice.kalt.s);
return 1;
}
- kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
}
else
- kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
}
kputs(splice.kalt.s + dbeg, &str);
#endif
}
-void tscript_splice_ref(tscript_t *tr)
+void tscript_splice_ref(gf_tscript_t *tr)
{
int i, len = 0;
for (i=0; i<tr->ncds; i++)
len += tr->cds[i]->len;
- tr->nsref = len + 2*N_REF_PAD;
- tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD;
+ TSCRIPT_AUX(tr)->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
len = 0;
- memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
len += N_REF_PAD;
for (i=0; i<tr->ncds; i++)
{
- memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
len += tr->cds[i]->len;
}
- memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
len += N_REF_PAD;
- tr->sref[len] = 0;
+ TSCRIPT_AUX(tr)->sref[len] = 0;
}
// returns: 0 if consequence was added, 1 if it already exists or could not be added
if ( csq->type & CSQ_UPSTREAM_STOP )
kputc_('*',str);
- int i, n = sizeof(csq_strings)/sizeof(char*);
+ int has_csq = 0, i, n = sizeof(csq_strings)/sizeof(char*);
for (i=1; i<n; i++)
- if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+ if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputs(csq_strings[i],str); break; }
i++;
for (; i<n; i++)
- if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+ if ( csq_strings[i] && csq->type&(1<<i) ) { has_csq = 1; kputc_('&',str); kputs(csq_strings[i],str); }
+
+ if ( (csq->biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) )
+ {
+ if ( has_csq ) kputc_('&',str); // just in case, this should always be true
+ kputs("NMD_transcript",str);
+ }
kputc_('|', str);
if ( csq->gene ) kputs(csq->gene , str);
kputc_('|', str);
- if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+// if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);
kputc_('|', str);
kputs(gf_type2gff_string(csq->biotype), str);
void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
{
int i;
- tscript_t *tr = hap->tr;
+ gf_tscript_t *tr = hap->tr;
int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
int icsq = node->ncsq_list++;
hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
str.l = 0;
// create the aa variant string
- int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
void hap_finalize(args_t *args, hap_t *hap)
{
- tscript_t *tr = hap->tr;
- if ( !tr->sref )
+ gf_tscript_t *tr = hap->tr;
+ if ( !TSCRIPT_AUX(tr)->sref )
tscript_splice_ref(tr);
kstring_t sref;
- sref.s = tr->sref;
- sref.l = tr->nsref;
+ sref.s = TSCRIPT_AUX(tr)->sref;
+ sref.l = TSCRIPT_AUX(tr)->nsref;
sref.m = sref.l;
int istack = 0;
hap->sseq.l = 0;
hap->tseq.l = 0;
- hap->stack[0].node = tr->root;
+ hap->stack[0].node = TSCRIPT_AUX(tr)->root;
hap->stack[0].ichild = -1;
hap->stack[0].slen = 0;
hap->stack[0].dlen = 0;
kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
-static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
if ( !node || !node->ncsq_list ) return;
}
}
-static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
if ( !node || !node->ncsq_list || ismpl<0 ) return;
tr_heap_t *heap = args->active_tr;
while ( heap->ndat && heap->dat[0]->end<=pos )
{
- tscript_t *tr = heap->dat[0];
+ gf_tscript_t *tr = heap->dat[0];
khp_delete(trhp, heap);
args->hap->tr = tr;
- if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild ) // normal, non-localized calling
{
hap_finalize(args, args->hap);
if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
{
if ( args->phase==PHASE_DROP_GT )
- hap_print_text(args, tr, -1,0, tr->hap[0]);
+ hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
else
{
for (i=0; i<args->smpl->n; i++)
{
for (j=0; j<2; j++)
- hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, TSCRIPT_AUX(tr)->hap[i*2+j]);
}
}
}
for (i=0; i<args->smpl->n; i++)
{
for (j=0; j<2; j++)
- hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, TSCRIPT_AUX(tr)->hap[i*2+j]);
}
}
}
// mark the transcript for deletion. Cannot delete it immediately because
// by-position VCF output will need them when flushed by vcf_buf_push
args->nrm_tr++;
- hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
args->rm_tr[args->nrm_tr-1] = tr;
}
}
for (i=0; i<args->nrm_tr; i++)
{
- tscript_t *tr = args->rm_tr[i];
- if ( tr->root ) hap_destroy(tr->root);
- tr->root = NULL;
- free(tr->hap);
- free(tr->ref);
- free(tr->sref);
+ gf_tscript_t *tr = args->rm_tr[i];
+ tscript_t *aux = TSCRIPT_AUX(tr);
+ if ( aux->root ) hap_destroy(aux->root);
+ aux->root = NULL;
+ free(aux->hap);
+ free(aux->ref);
+ free(aux->sref);
+ free(aux);
+ tr->aux = NULL;
}
args->nrm_tr = 0;
args->ncsq_buf = 0;
}
-void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
{
int i, len;
int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
- tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
- if ( !tr->ref )
+ const char *tmp_chr = chr;
+ if ( !faidx_has_seq(args->fai,tmp_chr) )
+ {
+ tmp_chr = drop_chr_prefix(args,chr);
+ if ( !faidx_has_seq(args->fai,tmp_chr) ) tmp_chr = add_chr_prefix(args,chr);
+ }
+ TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, tmp_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !TSCRIPT_AUX(tr)->ref )
error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
{
char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
- memcpy(ref+i, tr->ref, len);
+ memcpy(ref+i, TSCRIPT_AUX(tr)->ref, len);
len += i;
for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
ref[i+len] = 0;
- free(tr->ref);
- tr->ref = ref;
+ free(TSCRIPT_AUX(tr)->ref);
+ TSCRIPT_AUX(tr)->ref = ref;
}
}
-static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
{
int vbeg = 0;
int rbeg = rec->pos - tr->beg + N_REF_PAD;
if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; }
- char *ref = tr->ref + rbeg;
+ char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
char *vcf = rec->d.allele[0] + vbeg;
- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD );
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );
int i = 0;
while ( ref[i] && vcf[i] )
{
int test_cds_local(args_t *args, bcf1_t *rec)
{
int i,j, ret = 0;
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
if ( !GF_is_coding(tr->type) ) continue;
ret = 1;
- if ( !tr->ref )
+ if ( !TSCRIPT_AUX(tr) )
{
+ tr->aux = calloc(sizeof(tscript_t),1);
tscript_init_ref(args, tr, chr);
tscript_splice_ref(tr);
khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
sanity_check_ref(args, tr, rec);
kstring_t sref;
- sref.s = tr->sref;
- sref.l = tr->nsref;
+ sref.s = TSCRIPT_AUX(tr)->sref;
+ sref.l = TSCRIPT_AUX(tr)->nsref;
sref.m = sref.l;
for (i=1; i<rec->n_allele; i++)
{
// create the aa variant string
kstring_t str = {0,0,0};
- int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
- int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
kprint_aa_prediction(args,aa_rbeg,tref,&str);
csq_stage(args, &csq, rec);
// all this only to clean vstr when vrec is flushed
- if ( !tr->root )
- tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
- tr->root->ncsq_list++;
- hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
- csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ if ( !TSCRIPT_AUX(tr)->root )
+ TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ TSCRIPT_AUX(tr)->root->ncsq_list++;
+ hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
+ csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
rm_csq->type.vstr = str;
}
if ( csq_type & ~CSQ_COMPOUND )
static int overlaps_warned = 0, multiploid_warned = 0;
int i, ret = 0, hap_ret;
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
if ( !GF_is_coding(tr->type) ) continue;
if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
ret = 1;
- if ( !tr->root )
+ if ( !TSCRIPT_AUX(tr) )
{
// initialize the transcript and its haplotype tree, fetch the reference sequence
+ tr->aux = calloc(sizeof(tscript_t),1);
tscript_init_ref(args, tr, chr);
- tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
- tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
- tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
- for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
- tr->root->nend = tr->nhap;
- tr->root->type = HAP_ROOT;
+ TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ TSCRIPT_AUX(tr)->hap = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
+ for (i=0; i<TSCRIPT_AUX(tr)->nhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
+ TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;
+ TSCRIPT_AUX(tr)->root->type = HAP_ROOT;
khp_insert(trhp, args->active_tr, &tr);
}
if ( args->phase==PHASE_DROP_GT )
{
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
- hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
hap_ret = hap_init(args, parent, child, cds, rec, 1);
if ( hap_ret!=0 )
parent->mchild = 1;
parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
parent->child[0] = child;
- tr->hap[0] = child;
- tr->hap[0]->nend = 1;
+ TSCRIPT_AUX(tr)->hap[0] = child;
+ TSCRIPT_AUX(tr)->hap[0]->nend = 1;
continue;
}
assert( ial < rec->n_allele );
if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
- hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
{
// this haplotype has been seen in another sample
- tr->hap[i] = parent->child[ parent->cur_child[ial] ];
- tr->hap[i]->nend++;
+ TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
+ TSCRIPT_AUX(tr)->hap[i]->nend++;
parent->nend--;
continue;
}
hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
parent->cur_child[ial] = j;
parent->child[j] = child;
- tr->hap[i] = child;
- tr->hap[i]->nend++;
+ TSCRIPT_AUX(tr)->hap[i] = child;
+ TSCRIPT_AUX(tr)->hap[i]->nend++;
parent->nend--;
}
}
}
int test_utr(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// note that the off-by-one extension of rlen is deliberate to account for insertions
if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
while ( regitr_overlap(args->itr) )
{
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
- tscript_t *tr = splice.tr = utr->tr;
+ gf_tscript_t *tr = splice.tr = utr->tr;
for (i=1; i<rec->n_allele; i++)
{
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
}
int test_splice(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
splice_t splice;
}
int test_tscript(args_t *args, bcf1_t *rec)
{
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
splice_t splice;
int i, ret = 0;
while ( regitr_overlap(args->itr) )
{
- tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
for (i=1; i<rec->n_allele; i++)
{
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
warned = 1;
}
- const char *chr = bcf_seqname(args->hdr,rec);
+ const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
// only insertions atm
int beg = rec->pos + 1;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
- tscript_t *tr = cds->tr;
+ gf_tscript_t *tr = cds->tr;
csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
- tscript_t *tr = utr->tr;
+ gf_tscript_t *tr = utr->tr;
csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
csq.pos = rec->pos;
csq.type.biotype = tr->type;
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
- tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
splice.vcf.alt = rec->d.allele[1];
splice.csq = csq_class;
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
// Perform a simple sanity check (that does not catch much), the chromosome must be present in the
// reference file
if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
- error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ {
+ if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
+ error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ }
}
if ( prev_pos > rec->pos )
error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased hets\n"
- "Options:\n"
- " -e, --exclude EXPR Exclude sites for which the expression is true\n"
+ "GFF options:\n"
+ " --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n"
" --force Run even if some sanity checks fail\n"
+ " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
+ "General options:\n"
+ " -e, --exclude EXPR Exclude sites for which the expression is true\n"
" -i, --include EXPR Select sites for which the expression is true\n"
" --no-version Do not append version and command line to the header\n"
" -o, --output FILE Write output to a file [standard output]\n"
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with <int> worker threads [0]\n"
" -v, --verbose INT Verbosity level 0-2 [1]\n"
+ " --write-index Automatically index the output files [off]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
args->verbosity = 1;
args->record_cmd_line = 1;
args->clevel = -1;
+ args->unify_chr_names = 1;
static struct option loptions[] =
{
{"targets-file",1,0,'T'},
{"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
+ {"write-index",no_argument,NULL,6},
+ {"dump-gff",required_argument,NULL,7},
+ {"unify-chr-names",required_argument,NULL,8},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
case 3 : args->record_cmd_line = 0; break;
case 'b':
args->brief_predictions = 1;
- fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+ fprintf(bcftools_stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n");
break;
case 'B':
args->brief_predictions = strtol(optarg,&tmp,10);
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
+ case 6 : args->write_index = 1; break;
+ case 7 : args->dump_gff = optarg; break;
+ case 8 :
+ if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0;
+ else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1;
+ else error("Could not parse: --unify-chr-names %s\n",optarg);
+ break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
#if ENABLE_PERL_FILTERS
PerlInterpreter *perl;
#endif
- char **undef_tag;
- int nundef_tag;
+ char **undef_tag, **used_tag;
+ int nundef_tag, nused_tag;
int status, exit_on_error;
};
*ntags = filter->nundef_tag;
return (const char**)filter->undef_tag;
}
+/*
+    Remember a tag referenced by the filtering expression. The stored name is
+    the optional prefix (e.g. "INFO/" or "FORMAT/") followed by str. Each name
+    is stored only once; the list can be retrieved with filter_list_used_tags().
+ */
+static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str)
+{
+    // assemble the full tag name
+    kstring_t name = {0,0,0};
+    if ( prefix ) kputs(prefix,&name);
+    kputs(str,&name);
+
+    // already on the list? Then drop the temporary string and leave
+    int idx = 0;
+    while ( idx < filter->nused_tag && strcmp(name.s,filter->used_tag[idx]) ) idx++;
+    if ( idx != filter->nused_tag )
+    {
+        free(name.s);
+        return;
+    }
+
+    // grow the list by one element; the list takes over the ownership of the string
+    filter->nused_tag++;
+    filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag);
+    if ( !filter->used_tag ) error("Could not allocate memory\n");
+    char **slot = &filter->used_tag[filter->nused_tag-1];
+    *slot = name.s;
+    if ( !*slot ) error("Could not allocate memory\n");
+}
+/*
+    Return the list of tags collected by filter_add_used_tag() and store their
+    number in *ntags. The returned array points to the filter's own storage.
+ */
+const char **filter_list_used_tags(filter_t *filter, int *ntags)
+{
+    const char **tags = (const char**)filter->used_tag;
+    *ntags = filter->nused_tag;
+    return tags;
+}
+
/*
{
tok->setter = filters_set_qual;
tok->tag = strdup("QUAL");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
tok->tag = strdup("FILTER");
filter->max_unpack |= BCF_UN_FLT;
tok->tag_type = BCF_HL_FLT;
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
{
tok->comparator = filters_cmp_id;
tok->tag = strdup("ID");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"CHROM",len) )
{
tok->setter = &filters_set_chrom;
tok->tag = strdup("CHROM");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"POS",len) )
{
tok->setter = &filters_set_pos;
tok->tag = strdup("POS");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"REF",len) )
tok->setter = &filters_set_ref_string;
tok->is_str = 1;
tok->tag = strdup("REF");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"ALT",len) )
tok->idxs[0] = -1;
tok->nidxs = 1;
tok->idx = -2;
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"N_ALT",len) )
}
tok->tag = strdup(tmp.s);
if ( tmp.s ) free(tmp.s);
+ filter_add_used_tag(filter,is_fmt ? "FORMAT/" : "INFO/",tok->tag);
return 0;
}
else if ( !strcasecmp(tmp.s,"ALT") )
tok->is_str = 1;
tok->tag = strdup(tmp.s);
free(tmp.s);
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strcasecmp(tmp.s,"AN") )
}
}
for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+ for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
free(filter->undef_tag);
+ free(filter->used_tag);
free(filter->cached_GT.buf);
free(filter->cached_GT.mask);
free(filter->filters);
#if ENABLE_PERL_FILTERS
PerlInterpreter *perl;
#endif
- char **undef_tag;
- int nundef_tag;
+ char **undef_tag, **used_tag;
+ int nundef_tag, nused_tag;
int status, exit_on_error;
};
*ntags = filter->nundef_tag;
return (const char**)filter->undef_tag;
}
+/*
+    Remember a tag referenced by the filtering expression. The stored name is
+    the optional prefix (e.g. "INFO/" or "FORMAT/") followed by str. Each name
+    is stored only once; the list can be retrieved with filter_list_used_tags().
+ */
+static void filter_add_used_tag(filter_t *filter, const char *prefix, char *str)
+{
+    // assemble the full tag name
+    kstring_t name = {0,0,0};
+    if ( prefix ) kputs(prefix,&name);
+    kputs(str,&name);
+
+    // already on the list? Then drop the temporary string and leave
+    int idx = 0;
+    while ( idx < filter->nused_tag && strcmp(name.s,filter->used_tag[idx]) ) idx++;
+    if ( idx != filter->nused_tag )
+    {
+        free(name.s);
+        return;
+    }
+
+    // grow the list by one element; the list takes over the ownership of the string
+    filter->nused_tag++;
+    filter->used_tag = (char**)realloc(filter->used_tag,sizeof(*filter->used_tag)*filter->nused_tag);
+    if ( !filter->used_tag ) error("Could not allocate memory\n");
+    char **slot = &filter->used_tag[filter->nused_tag-1];
+    *slot = name.s;
+    if ( !*slot ) error("Could not allocate memory\n");
+}
+/*
+    Return the list of tags collected by filter_add_used_tag() and store their
+    number in *ntags. The returned array points to the filter's own storage.
+ */
+const char **filter_list_used_tags(filter_t *filter, int *ntags)
+{
+    const char **tags = (const char**)filter->used_tag;
+    *ntags = filter->nused_tag;
+    return tags;
+}
+
/*
{
tok->setter = filters_set_qual;
tok->tag = strdup("QUAL");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ )
tok->tag = strdup("FILTER");
filter->max_unpack |= BCF_UN_FLT;
tok->tag_type = BCF_HL_FLT;
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ )
{
tok->comparator = filters_cmp_id;
tok->tag = strdup("ID");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"CHROM",len) )
{
tok->setter = &filters_set_chrom;
tok->tag = strdup("CHROM");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"POS",len) )
{
tok->setter = &filters_set_pos;
tok->tag = strdup("POS");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"REF",len) )
tok->setter = &filters_set_ref_string;
tok->is_str = 1;
tok->tag = strdup("REF");
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"ALT",len) )
tok->idxs[0] = -1;
tok->nidxs = 1;
tok->idx = -2;
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strncasecmp(str,"N_ALT",len) )
}
tok->tag = strdup(tmp.s);
if ( tmp.s ) free(tmp.s);
+ filter_add_used_tag(filter,is_fmt ? "FORMAT/" : "INFO/",tok->tag);
return 0;
}
else if ( !strcasecmp(tmp.s,"ALT") )
tok->is_str = 1;
tok->tag = strdup(tmp.s);
free(tmp.s);
+ filter_add_used_tag(filter,NULL,tok->tag);
return 0;
}
else if ( !strcasecmp(tmp.s,"AN") )
}
}
for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+ for (i=0; i<filter->nused_tag; i++) free(filter->used_tag[i]);
free(filter->undef_tag);
+ free(filter->used_tag);
free(filter->cached_GT.buf);
free(filter->cached_GT.mask);
free(filter->filters);
*/
int filter_status(filter_t *filter);
const char **filter_list_undef_tags(filter_t *filter, int *nundef);
+const char **filter_list_used_tags(filter_t *filter, int *nused);
#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2023 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "gff.h"
+
+/*
+ Helper structures, only for initialization
+
+ ftr_t
+ temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+typedef struct
+{
+ int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+ uint32_t beg; // NOTE(review): presumably 0-based, as produced by gff_parse_beg_end() - confirm
+ uint32_t end; // NOTE(review): presumably 0-based inclusive - confirm
+ uint32_t trid; // presumably the numeric transcript id, see id_tbl_t - TODO confirm
+ uint32_t strand:1; // STRAND_REV,STRAND_FWD
+ uint32_t phase:2; // 0, 1, 2, or 3 for unknown
+ uint32_t iseq:29; // sequence index; 29 bits wide, hence the nseq limit asserted in feature_set_seq()
+}
+ftr_t;
+
+/*
+ Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+ to integer id. To keep the memory requirements low, the original version
+ relied on IDs in the form of a string prefix and a numerical id. However,
+ it turns out that this assumption is not valid for some ensembl GFFs, see
+ for example Zea_mays.AGPv4.36.gff3.gz
+ */
+typedef struct
+{
+ void *str2id; // khash_str2int, string to numeric id
+ int nstr, mstr; // number of used and allocated elements in str
+ char **str; // numeric id to string
+}
+id_tbl_t;
+
+typedef struct
+{
+ // all exons, CDS, UTRs
+ ftr_t *ftr;
+ int nftr, mftr; // presumably the number of used and allocated elements in ftr - TODO confirm
+
+ // mapping from gene id to gf_gene_t
+ kh_int2gene_t *gid2gene;
+
+ // mapping from transcript id to tscript, for quick CDS anchoring
+ kh_int2tscript_t *id2tr;
+
+ // sequences
+ void *seq2int; // str2int hash, sequence name to its index in seq
+ char **seq; // sequence index to sequence name
+ int nseq, mseq; // number of used and allocated elements in seq
+
+ // ignored biotypes
+ void *ignored_biotypes; // str2int hash, biotype string to the number of times it was ignored
+
+ id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
+
+ // pointers to the current partially processed line
+ char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end,
+ *chr, *chr_end, *name, *name_end, *type, *type_end;
+}
+aux_t;
+
+struct gff_t_
+{
+ const char *fname, *dump_fname;
+
+ // the main regidx lookups, from chr:beg-end to overlapping features and
+ // index iterator
+ regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+
+ // temporary structures, deleted after initialization
+ aux_t init;
+
+ // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
+ id_tbl_t tscript_ids;
+
+ int strip_chr_names, verbosity; // strip_chr_names: skip the leading "chr" of sequence names, see gff_parse_chr()
+ int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
+
+ struct {
+ int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ } warned; // counters of issued warnings, one per warning type
+};
+
+static const char *gf_strings_noncoding[] = // indexed by type-1, see gf_type2gff_string(); the order of the elements matters
+{
+ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+ "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
+ "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+ "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf",
+ "lncRNA"
+};
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"}; // indexed by the bits below GF_coding_bit, minus one
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" }; // indexed by the low bits of the special types, minus one
+
+/*
+    Set an option. Exactly one variadic argument is read, its type depends on the key:
+        dump_fname          .. char*
+        force_out_of_phase  .. int
+        strip_chr_names     .. int
+        verbosity           .. int
+    Any other key is reported via error(). Returns 0.
+ */
+int gff_set(gff_t *gff, gff_opt_t key, ...)
+{
+    va_list args;
+    if ( key==dump_fname )
+    {
+        va_start(args, key);
+        gff->dump_fname = va_arg(args,char*);
+        va_end(args);
+    }
+    else if ( key==force_out_of_phase )
+    {
+        va_start(args, key);
+        gff->force = va_arg(args,int);
+        va_end(args);
+    }
+    else if ( key==strip_chr_names )
+    {
+        va_start(args, key);
+        gff->strip_chr_names = va_arg(args,int);
+        va_end(args);
+    }
+    else if ( key==verbosity )
+    {
+        va_start(args, key);
+        gff->verbosity = va_arg(args,int);
+        va_end(args);
+    }
+    else
+        error("The key %d is not supported with gff_set\n",key);
+
+    return 0;
+}
+
+/*
+    Return one of the region indexes (idx_cds, idx_utr, idx_exon, idx_tscript).
+    Any other key is reported via error(); NULL is returned if error() returns.
+ */
+void *gff_get(gff_t *gff, gff_opt_t key)
+{
+    if ( key==idx_cds ) return gff->idx_cds;
+    if ( key==idx_utr ) return gff->idx_utr;
+    if ( key==idx_exon ) return gff->idx_exon;
+    if ( key==idx_tscript ) return gff->idx_tscript;
+
+    error("The key %d is not supported with gff_get\n",key);
+    return NULL;
+}
+
+/*
+    Translate a numeric id back to the original GFF string. The type argument
+    is not consulted, only transcript ids are currently supported. No range
+    check is performed on id.
+ */
+const char *gff_id2string(gff_t *gff, id_type_t type, int id)
+{
+    char **names = gff->tscript_ids.str;
+    return names[id];
+}
+
+/*
+    Convert a numeric feature type to its GFF string. Three tables are used:
+    coding biotypes (the coding bit is set), non-coding biotypes (values below
+    1<<GF_coding_bit) and the remaining special types (CDS, exon, UTRs).
+ */
+const char *gf_type2gff_string(int type)
+{
+    if ( GF_is_coding(type) )
+    {
+        // keep only the bits below the coding bit
+        int idx = type & ((1<<GF_coding_bit) - 1);
+        return gf_strings_coding[idx - 1];
+    }
+
+    if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+
+    // special types: keep the bits up to and including the coding bit
+    int idx = type & ((1<<(GF_coding_bit+1)) - 1);
+    return gf_strings_special[idx - 1];
+}
+
+/*
+ gff parsing functions
+*/
+/*
+    Return the numeric index of the sequence name delimited by chr_beg and
+    chr_end (inclusive), registering the name if it has not been seen yet.
+    The input buffer is temporarily null-terminated and restored before return.
+ */
+static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
+{
+    aux_t *aux = &gff->init;
+
+    // temporarily terminate the name so that it can be used as a hash key
+    char saved = chr_end[1];
+    chr_end[1] = 0;
+
+    int iseq;
+    int is_new = khash_str2int_get(aux->seq2int, chr_beg, &iseq) != 0;
+    if ( is_new )
+    {
+        char *name = strdup(chr_beg);
+        hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+        aux->seq[aux->nseq] = name;
+        iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+        aux->nseq++;
+        assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
+    }
+
+    chr_end[1] = saved;
+    return iseq;
+}
+/*
+    Skip the current tab-delimited field: returns a pointer to the character
+    following the next tab found at or after ss. A missing tab is reported
+    via error(), with the whole line included in the message.
+ */
+static inline char *gff_skip(const char *line, char *ss)
+{
+    for (; *ss; ss++)
+        if ( *ss=='\t' ) return ss+1;
+
+    // end of string reached without finding a tab
+    error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return ss+1;
+}
+/*
+    Locate the first tab-delimited field of the line, the sequence name.
+    On return *chr_beg points to its first and *chr_end to its last character.
+    With strip_chr_names set, a leading "chr" (case-insensitive) is skipped.
+    A line without a tab is reported via error().
+ */
+static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end)
+{
+    const char *name = line;
+    char *tab = (char*) line;
+    while ( *tab!='\t' && *tab ) tab++;
+    if ( *tab=='\0' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    if ( gff->strip_chr_names && strncasecmp("chr",line,3)==0 ) name += 3;
+
+    *chr_beg = (char*) name;
+    *chr_end = tab - 1;
+}
+/*
+    Parse two consecutive integer fields starting at ss, separated by a single
+    character, and store them decremented by one in *beg and *end. Returns a
+    pointer past the character that follows the second number. A field with
+    no digits is reported via error().
+ */
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+    char *tail = ss;
+
+    // first coordinate
+    long val = strtol(ss, &tail, 10);
+    *beg = val - 1;
+    if ( tail==ss ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+
+    // second coordinate, starts right after the delimiter
+    ss = tail + 1;
+    val = strtol(ss, &tail, 10);
+    *end = val - 1;
+    if ( tail==ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    return tail + 1;
+}
+static void gff_id_init(id_tbl_t *tbl) // reset the table and create an empty string-to-id hash
+{
+ memset(tbl, 0, sizeof(*tbl)); // zeroes nstr, mstr and str so that the array can be grown on demand
+ tbl->str2id = khash_str2int_init();
+}
+static void gff_id_destroy(id_tbl_t *tbl) // release the hash and the id-to-string array
+{
+ khash_str2int_destroy_free(tbl->str2id); // NOTE(review): presumably also frees the keys, i.e. the strings referenced from tbl->str - confirm
+ free(tbl->str); // only the array itself is freed here
+}
+/*
+    Map the string delimited by beg and end (inclusive) to a numeric id,
+    assigning the next free id if the string has not been seen before. The
+    id is stored in *id_ptr. The input buffer is temporarily null-terminated
+    and restored before return. Always returns 0.
+ */
+static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr)
+{
+    // temporarily terminate the string so that it can be used as a hash key
+    char saved = end[1];
+    end[1] = 0;
+
+    int id;
+    int known = khash_str2int_get(tbl->str2id, beg, &id) >= 0;
+    if ( !known )
+    {
+        // new string: keep a private copy, it serves also as the hash key
+        id = tbl->nstr++;
+        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+        tbl->str[id] = strdup(beg);
+        khash_str2int_set(tbl->str2id, tbl->str[id], id);
+    }
+
+    end[1] = saved;
+    *id_ptr = id;
+    return 0;
+}
+/*
+    Translate a biotype string to one of the GF_* codes.
+
+    The string does not have to be null-terminated at the end of the biotype:
+    only the leading characters are compared, so "lncRNA;tag=basic" is
+    recognised as lncRNA. Longer names are tested before names which are
+    their prefix (e.g. "IG_C_pseudogene" before "IG_C").
+
+    Returns -1 if line is NULL, 0 if the biotype is not recognised, and the
+    GF_* code otherwise.
+ */
+static inline int gff_parse_biotype(char *line)
+{
+    if ( !line ) return -1;
+    switch (*line)
+    {
+        case 'p':
+            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+            break;
+        case 'a':
+            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+            break;
+        case 'I':
+            if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C;
+            else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D;
+            else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J;
+            else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V;
+            else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV;
+            break;
+        case 'T':
+            if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C;
+            else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D;
+            else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J;
+            else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V;
+            break;
+        case 'M':
+            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+            else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+            // mitochondrial rRNA has its own code, it must not be reported as tRNA
+            else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_rRNA;
+            else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'l':
+            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+            // compare 6 characters only: with 7 the terminating null byte would
+            // take part in the comparison and "lncRNA;..." would never match
+            if ( !strncmp(line,"lncRNA",6) ) return GF_lncRNA;
+            break;
+        case 'm':
+            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+            else if ( !strncasecmp(line,"mRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'r':
+            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+            break;
+        case 's':
+            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+            break;
+        case 't':
+            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+            break;
+        case 'n':
+            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+            break;
+        case 'N':
+            if ( !strncmp(line,"NMD",3) ) return GF_NMD;
+            break;
+        case 'k':
+            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+            break;
+        case 'u':
+            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+            break;
+        case 'L':
+            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+            break;
+        case '3':
+            if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            break;
+        case 'd':
+            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+            break;
+        case 'v':
+            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+            break;
+        case 'b':
+            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+            break;
+    }
+    return 0;
+}
+/*
+    Count an ignored biotype. The biotype string is delimited by ss and se
+    (inclusive); the input buffer is temporarily null-terminated and restored
+    before return. Returns 0 if ss is NULL and nothing was recorded, 1 otherwise.
+ */
+static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se)
+{
+    if ( !ss ) return 0;
+
+    // temporarily terminate the string so that it can be used as a hash key
+    char saved = se[1];
+    se[1] = 0;
+
+    // a biotype seen for the first time needs a private copy of the key
+    int count = 0;
+    int known = khash_str2int_get(gff->init.ignored_biotypes, ss, &count)==0;
+    char *key = known ? ss : strdup(ss);
+    khash_str2int_set(gff->init.ignored_biotypes, key, count+1);
+
+    se[1] = saved;
+    return 1;
+}
+// Return the gene record stored under gene_id in aux->gid2gene. When the id is not
+// present (or maps to NULL) a zero-initialized record is allocated and inserted.
+static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+    khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+    if ( k != kh_end(aux->gid2gene) && kh_val(aux->gid2gene, k) )
+        return kh_val(aux->gid2gene, k);
+
+    gf_gene_t *gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+    int absent;
+    k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &absent);
+    kh_val(aux->gid2gene,k) = gene;
+    return gene;
+}
+// Register a transcript line. The biotype is taken from the biotype attribute or, failing
+// that, from the feature type column. Lines with an unsupported biotype are counted (or
+// warned about when no biotype attribute is present) and skipped. Otherwise the transcript
+// and its parent gene are mapped to numeric ids and a new gf_tscript_t is stored in
+// aux->id2tr. Missing ID or Parent attributes are fatal.
+static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    ftr->type = gff_parse_biotype(aux->biotype);
+    if ( ftr->type <= 0 )
+    {
+        // fall back to the type column, NUL-terminated in place for the duration of the call
+        char *term = aux->type_end + 1;
+        char saved = *term;
+        *term = 0;
+        ftr->type = gff_parse_biotype(aux->type);
+        *term = saved;
+    }
+    if ( ftr->type <= 0 )
+    {
+        // a known-but-unsupported biotype is only counted; warn when there is no biotype at all
+        if ( gff_ignored_biotype(gff,aux->biotype,aux->biotype_end) ) return;
+        if ( gff->verbosity <= 0 ) return;
+        if ( gff->verbosity > 1 || !gff->warned.unknown_tscript_biotype )
+            fprintf(stderr,"Warning: Ignoring transcript with unknown biotype .. %s\n", line);
+        gff->warned.unknown_tscript_biotype++;
+        return;
+    }
+
+    if ( aux->id == NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    if ( aux->parent == NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // string ids to numeric ids
+    uint32_t trid, gene_id;
+    gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid);
+    gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id);
+
+    gf_tscript_t *tscript = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t));
+    tscript->id     = trid;
+    tscript->strand = ftr->strand;
+    tscript->gene   = gene_init(aux, gene_id);
+    tscript->type   = ftr->type;
+    tscript->beg    = ftr->beg;
+    tscript->end    = ftr->end;
+
+    int absent;
+    khint_t k = kh_put(int2tscript, aux->id2tr, (int)trid, &absent);
+    kh_val(aux->id2tr,k) = tscript;
+}
+// Register an exon, CDS or UTR line: resolve the numeric id of the parent transcript and
+// the index of the sequence. A missing Parent attribute is fatal. Unknown strand or phase
+// only triggers a warning here; the feature itself is still filled in by this function.
+static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( aux->parent == NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // associate with transcript id
+    gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
+
+    const int warn     = gff->verbosity > 0;    // warnings enabled at all
+    const int warn_all = gff->verbosity > 1;    // print every occurrence, not only the first
+
+    // NOTE(review): if ftr_t.strand and ftr_t.phase are narrow unsigned bit-fields, the
+    // comparisons with -1 below can never be true - confirm against the ftr_t definition
+    if ( warn && ftr->strand==-1 )
+    {
+        if ( warn_all || !gff->warned.unknown_strand )
+            fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
+        gff->warned.unknown_strand++;
+    }
+    if ( warn && ftr->phase==-1 )
+    {
+        if ( warn_all || !gff->warned.unknown_phase )
+            fprintf(stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
+        gff->warned.unknown_phase++;
+    }
+
+    ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end);
+}
+// Register a gene line: map its ID to a numeric id, look up or create the gene record and
+// fill in its sequence index, coordinates, strand and name. Lines without an ID attribute
+// are silently skipped. A line whose gene id already has a name is treated as a duplicate:
+// it is reported (subject to the verbosity setting) and otherwise ignored.
+static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( !aux->id ) return;
+
+    uint32_t gene_id;
+    gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id);
+
+    gf_gene_t *gene = gene_init(aux, gene_id);
+    if ( gene->name )
+    {
+        // Honour verbosity 0 like every other parser warning; the first occurrence is
+        // printed, later ones only with verbosity > 1
+        if ( gff->verbosity > 0 )
+        {
+            if ( !gff->warned.duplicate_id || gff->verbosity > 1 )
+                fprintf(stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line);
+            gff->warned.duplicate_id++;
+        }
+        return;
+    }
+
+    gene->iseq   = feature_set_seq(gff, aux->chr,aux->chr_end);
+    gene->beg    = ftr->beg;
+    gene->end    = ftr->end;
+    gene->strand = ftr->strand;
+    gene->id     = gene_id;
+
+    if ( aux->name )
+    {
+        // name_end points at the last character of the name, hence the +1
+        size_t len = aux->name_end - aux->name + 1;
+        gene->name = (char*) malloc(len + 1);
+        memcpy(gene->name,aux->name,len);
+        gene->name[len] = 0;
+    }
+    else
+        gene->name = strdup(aux->gene_ids.str[gene_id]);    // Name=<GeneName> field is not present, use the gene ID instead
+}
+
+// Parse one GFF line. Pointers into the line for the current record (chr, type, id, parent,
+// name, biotype) are stored in gff->init, coordinates, strand and phase in ftr.
+// Returns 0 for exons, CDS and UTRs to indicate these need to be pruned later and regidx built
+// on them, or -1 to indicate the structure need not be saved (either because the line was
+// skipped or because it was saved as a transcript or gene).
+static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
+{
+    // - skip empty lines and commented lines
+    // - columns
+    //      1.      chr
+    //      2.      <skip>
+    //      3.      CDS, transcript, gene, ...
+    //      4-5.    beg,end
+    //      6.      <skip>
+    //      7.      strand
+    //      8.      phase
+    //      9.      Parent=transcript:ENST(\d+);ID=...;biotype=... etc
+
+    char *ss = line;
+    if ( !*ss ) return -1;      // skip blank lines
+    if ( *ss=='#' ) return -1;  // skip comments
+
+    aux_t *aux = &gff->init;
+    gff_parse_chr(gff, line, &aux->chr, &aux->chr_end);
+    ss = gff_skip(line, aux->chr_end + 2);  // chr_end+2 is the start of the 2nd column, which is skipped
+
+    // 3rd column: is this a CDS, transcript, gene, etc.. The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3
+    int is_gene_line = 0;
+    ftr->type = 0;
+    aux->type = ss;
+    if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+    else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+    else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+    else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+    else if ( !strncmp("biological_region\t",ss,18) ) { return -1; } // skip
+    else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; }
+    else ss = gff_skip(line, ss);
+    aux->type_end = ss - 1;     // the tab terminating the type column
+
+    // 4-5th columns: beg,end
+    ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+
+    // 6th column: skip
+    ss = gff_skip(line, ss);
+
+    // 7th column: strand. The column is left via gff_skip rather than a fixed +2 offset,
+    // so that a truncated line is reported instead of stepping over the terminating NUL
+    ftr->strand = -1;
+    if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+    else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+    ss = gff_skip(line, ss);
+
+    // 8th column: phase (codon offset)
+    ftr->phase = -1;
+    if ( *ss == '0' ) ftr->phase = 0;
+    else if ( *ss == '1' ) ftr->phase = 1;
+    else if ( *ss == '2' ) ftr->phase = 2;
+    else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN;     // exons and even CDS in some GFFs do not have phase
+    // move to the attributes; when the 9th column is missing, stop at the terminating NUL
+    while ( *ss && *ss!='\t' ) ss++;
+    if ( *ss ) ss++;
+
+    // 9th column: id, parent, name, biotype
+    aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL;
+    while ( *ss )
+    {
+        // es marks the end of the current key=value pair (';' or end of line)
+        char *es = ss;
+        while ( *es && *es!=';' ) es++;
+        if ( !strncmp(ss,"ID=",3) )
+        {
+            ss += 3;
+            aux->id_end = es - 1;
+            aux->id = ss;
+            if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; }
+            else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11;
+        }
+        else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; }
+        else if ( !strncmp(ss,"Parent=",7) )
+        {
+            ss += 7;
+            aux->parent_end = es - 1;
+            aux->parent = ss;
+            if ( !strncmp(ss,"gene:",5) ) aux->parent += 5;
+            else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11;
+        }
+        else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; }
+        else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; }
+        if ( !*es ) break;
+        ss = es + 1;
+    }
+
+    // anything without a parent is treated as a gene
+    if ( is_gene_line || !aux->parent )
+    {
+        gff_parse_gene(gff, line, ftr);
+        return -1;
+    }
+
+    // exon, CDS, UTR: to be kept by the caller
+    if ( ftr->type )
+    {
+        gff_parse_exon(gff, line, ftr);
+        return 0;
+    }
+
+    gff_parse_transcript(gff, line, ftr);
+    return -1;
+}
+
+// qsort comparator for arrays of gf_cds_t pointers: ascending order of the beg coordinate
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+    const gf_cds_t *x = *((gf_cds_t**)a);
+    const gf_cds_t *y = *((gf_cds_t**)b);
+    return (x->beg > y->beg) - (x->beg < y->beg);
+}
+
+// Return pointers to the first and the last character of the name of sequence iseq
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+    char *last = aux->seq[iseq];
+    for ( ; last[1]; last++ ) ;     // stop on the character preceding the terminating NUL
+    *chr_beg = aux->seq[iseq];
+    *chr_end = last;
+}
+// Look up the transcript registered under trid in aux->id2tr; the transcript must exist
+static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+    gf_tscript_t *tr = NULL;
+    khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+    if ( k != kh_end(aux->id2tr) ) tr = kh_val(aux->id2tr, k);
+    assert( tr );
+    return tr;
+}
+// Attach a parsed CDS line (ftr) to its transcript by appending a new gf_cds_t to tr->cds.
+// The CDS is not inserted into a regidx here. A strand differing from the transcript's
+// strand is fatal.
+static void register_cds(gff_t *gff, ftr_t *ftr)
+{
+    gf_tscript_t *tr = tscript_init(&gff->init, ftr->trid);
+    if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+    gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(*cds));
+    cds->tr    = tr;
+    cds->beg   = ftr->beg;
+    cds->len   = ftr->end - ftr->beg + 1;
+    cds->icds  = 0;     // to keep valgrind on mac happy
+    cds->phase = ftr->phase;
+
+    // grow the transcript's CDS array as needed and append
+    hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
+    tr->cds[tr->ncds] = cds;
+    tr->ncds++;
+}
+// Create a gf_utr_t from a parsed UTR line (ftr) and insert it into gff->idx_utr under the
+// sequence of the parent transcript's gene
+static void register_utr(gff_t *gff, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+    if ( ftr->type==GF_UTR3 ) utr->which = prime3;
+    else utr->which = prime5;
+    utr->beg = ftr->beg;
+    utr->end = ftr->end;
+    utr->tr  = tscript_init(aux, ftr->trid);
+
+    char *name_first, *name_last;
+    chr_beg_end(aux, utr->tr->gene->iseq, &name_first, &name_last);
+    regidx_push(gff->idx_utr, name_first,name_last, utr->beg,utr->end, &utr);
+}
+// Create a gf_exon_t from a parsed exon line (ftr) and insert it into gff->idx_exon under
+// the sequence of the parent transcript's gene. The indexed interval is the exon widened
+// by N_SPLICE_REGION_INTRON on both sides.
+static void register_exon(gff_t *gff, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+    exon->beg = ftr->beg;
+    exon->end = ftr->end;
+    exon->tr  = tscript_init(aux, ftr->trid);
+
+    char *name_first, *name_last;
+    chr_beg_end(aux, exon->tr->gene->iseq, &name_first, &name_last);
+    regidx_push(gff->idx_exon, name_first,name_last, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+// Finalize all registered transcripts. Each transcript is pushed into idx_tscript. For
+// transcripts with CDS, the CDS are sorted by position, an incomplete 5' end is trimmed
+// according to the phase of the first CDS in transcription order, the phase column is
+// checked against the accumulated CDS length, an incomplete 3' end is trimmed so that the
+// total coding length is a multiple of three, and finally the offsets are set and the CDS
+// pushed into idx_cds. An inconsistent phase is fatal unless gff->force is set, in which
+// case the transcript's CDS are left out of idx_cds.
+static void tscript_init_cds(gff_t *gff)
+{
+    aux_t *aux = &gff->init;
+
+    // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+    khint_t k;
+    for (k=0; k<kh_end(aux->id2tr); k++)
+    {
+        if ( !kh_exist(aux->id2tr, k) ) continue;
+        gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k);
+
+        // position-to-tscript lookup
+        char *chr_beg, *chr_end;
+        chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+        regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+        if ( !tr->ncds ) continue;      // transcript with no CDS
+
+        // sort CDs
+        qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+        // trim non-coding start
+        int i, len = 0;
+        if ( tr->strand==STRAND_FWD )
+        {
+            if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
+            {
+                if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+                tr->cds[0]->beg += tr->cds[0]->phase;
+                tr->cds[0]->len -= tr->cds[0]->phase;
+                tr->cds[0]->phase = 0;
+            }
+
+            // sanity check phase; the phase number in gff tells us how many bases to skip in this
+            // feature to reach the first base of the next codon
+            int tscript_ok = 1;
+            for (i=0; i<tr->ncds; i++)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+        else
+        {
+            if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
+            {
+                // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+                // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141.
+                // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff
+                // todo: the same for the fwd strand
+                i = tr->ncds - 1;
+                int phase = tr->cds[i]->phase;
+                if ( phase ) tr->trim |= TRIM_5PRIME;
+                while ( i>=0 && phase > tr->cds[i]->len )
+                {
+                    phase -= tr->cds[i]->len;
+                    tr->cds[i]->phase = 0;
+                    tr->cds[i]->len   = 0;
+                    i--;
+                }
+                // If the phase consumed every CDS, i is -1 and there is nothing left to trim;
+                // tr->cds[i] must not be touched in that case
+                if ( i >= 0 )
+                {
+                    if ( gff->verbosity > 0 && tr->cds[i]->phase )
+                    {
+                        if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                            fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.incomplete_cds++;
+                    }
+                    tr->cds[i]->len  -= tr->cds[i]->phase;
+                    tr->cds[i]->phase = 0;
+                }
+            }
+
+            // sanity check phase
+            int tscript_ok = 1;
+            for (i=tr->ncds-1; i>=0; i--)
+            {
+                if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+                {
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+                        gff->warned.unknown_cds_phase++;
+                    }
+                    len += tr->cds[i]->len;
+                    continue;
+                }
+                int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+                if ( phase!=len%3 )
+                {
+                    if ( !gff->force )
+                        error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                                gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    if ( gff->verbosity > 0 )
+                    {
+                        if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+                            fprintf(stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+                                    gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                        gff->warned.wrong_phase++;
+                    }
+                    tscript_ok = 0;
+                    break;
+                }
+                len += tr->cds[i]->len;
+            }
+            if ( !tscript_ok ) continue;    // skip this transcript
+        }
+
+        // set len. At the same check that CDS within a transcript do not overlap
+        len = 0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->icds = i;
+            len += tr->cds[i]->len;
+            if ( !i ) continue;
+
+            gf_cds_t *a = tr->cds[i-1];
+            gf_cds_t *b = tr->cds[i];
+            if ( a->beg + a->len - 1 >= b->beg )
+            {
+                if ( gff->verbosity > 0 )
+                {
+                    if ( !gff->warned.overlapping_cds || gff->verbosity > 1 )
+                        fprintf(stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n",
+                                gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+                    gff->warned.overlapping_cds++;
+                }
+            }
+        }
+
+        if ( len%3 != 0 )
+        {
+            // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+            //  http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+            // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+            if ( gff->verbosity > 0 )
+            {
+                if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+                    fprintf(stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+                gff->warned.incomplete_cds++;
+            }
+
+            tr->trim |= TRIM_3PRIME;
+            if ( tr->strand==STRAND_FWD )
+            {
+                // the 3' end is the last CDS by position: shorten from the end
+                i = tr->ncds - 1;
+                while ( i>=0 && len%3 )
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    len -= dlen;
+                    i--;
+                }
+            }
+            else
+            {
+                // the 3' end is the first CDS by position: shorten and shift the start
+                i = 0;
+                while ( i<tr->ncds && len%3 )
+                {
+                    int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+                    tr->cds[i]->len -= dlen;
+                    tr->cds[i]->beg += dlen;
+                    len -= dlen;
+                    i++;
+                }
+            }
+        }
+
+        // set CDS offsets and insert into regidx
+        len=0;
+        for (i=0; i<tr->ncds; i++)
+        {
+            tr->cds[i]->pos = len;
+            len += tr->cds[i]->len;
+            regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+        }
+    }
+}
+
+// regidx payload destructor: the payload holds a single pointer, which is freed
+static void regidx_free_gf(void *payload)
+{
+    gf_cds_t **slot = (gf_cds_t**) payload;
+    free(*slot);
+}
+// regidx payload destructor for transcripts: frees the CDS pointer array and the transcript
+// itself (not the gf_cds_t structures the array points to)
+static void regidx_free_tscript(void *payload)
+{
+    gf_tscript_t **slot = (gf_tscript_t**) payload;
+    gf_tscript_t *tr = *slot;
+    free(tr->cds);
+    free(tr);
+}
+
+// Write the parsed annotation to fname as GFF text via BGZF: first all genes, then the
+// contents of idx_tscript, idx_cds, idx_utr and idx_exon. Stored coordinates are printed
+// incremented by one. Open, write and close failures are fatal. Always returns 0.
+static int gff_dump(gff_t *gff, const char *fname)
+{
+    BGZF *fp = bgzf_open(fname,"wg");
+    if ( fp==NULL ) error("Failed to open %s: %s\n", fname, strerror(errno));
+
+    aux_t *aux = &gff->init;
+    kstring_t buf = {0,0,0};    // reused for every output line
+
+    // genes
+    khint_t k;
+    for (k=0; k<kh_end(aux->gid2gene); k++)
+    {
+        if ( !kh_exist(aux->gid2gene, k) ) continue;
+        gf_gene_t *gene = (gf_gene_t*) kh_val(aux->gid2gene, k);
+        char *gene_id = aux->gene_ids.str[gene->id];
+        buf.l = 0;
+        ksprintf(&buf,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",aux->seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
+        if ( bgzf_write(fp, buf.s, buf.l) != buf.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+
+    // transcripts; protein coding ones are written with the type "mRNA"
+    regitr_t *it = regitr_init(gff->idx_tscript);
+    while ( regitr_loop(it) )
+    {
+        gf_tscript_t *tr = regitr_payload(it, gf_tscript_t*);
+        char *gene_id = aux->gene_ids.str[tr->gene->id];
+        const char *type;
+        if ( tr->type==GF_PROTEIN_CODING ) type = "mRNA";
+        else type = gf_type2gff_string(tr->type);
+        buf.l = 0;
+        ksprintf(&buf,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",it->seq,type,it->beg+1,it->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+        if ( bgzf_write(fp, buf.s, buf.l) != buf.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(it);
+
+    // CDS; a phase value of 3 is written as '.'
+    it = regitr_init(gff->idx_cds);
+    while ( regitr_loop(it) )
+    {
+        gf_cds_t *cds = regitr_payload(it,gf_cds_t*);
+        gf_tscript_t *tr = cds->tr;
+        buf.l = 0;
+        ksprintf(&buf,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",it->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(fp, buf.s, buf.l) != buf.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(it);
+
+    // UTRs
+    it = regitr_init(gff->idx_utr);
+    while ( regitr_loop(it) )
+    {
+        gf_utr_t *utr = regitr_payload(it,gf_utr_t*);
+        gf_tscript_t *tr = utr->tr;
+        buf.l = 0;
+        ksprintf(&buf,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",it->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(fp, buf.s, buf.l) != buf.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(it);
+
+    // exons
+    it = regitr_init(gff->idx_exon);
+    while ( regitr_loop(it) )
+    {
+        gf_exon_t *exon = regitr_payload(it,gf_exon_t*);
+        gf_tscript_t *tr = exon->tr;
+        buf.l = 0;
+        ksprintf(&buf,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",it->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+        if ( bgzf_write(fp, buf.s, buf.l) != buf.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+    }
+    regitr_destroy(it);
+
+    if ( bgzf_close(fp)!=0 ) error("Error: close failed .. %s\n", fname);
+    free(buf.s);
+
+    return 0;
+}
+
+// Read the GFF file gff->fname and build the lookup indexes idx_tscript, idx_cds, idx_utr
+// and idx_exon. The file is parsed line by line; exon/CDS/UTR records are collected in a
+// temporary list and attached to their transcripts once the whole file has been read.
+// Features whose transcript was not registered are dropped. Optionally dumps the parsed
+// annotation to gff->dump_fname. Temporary parsing structures are released before
+// returning, except gid2gene, which is kept until gff_destroy. Failure to open, read or
+// close the file, or the absence of any usable transcript, is fatal. Always returns 0.
+int gff_parse(gff_t *gff)
+{
+    if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname);
+
+    aux_t *aux = &gff->init;
+    aux->seq2int  = khash_str2int_init();   // chrom's numeric id
+    aux->gid2gene = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
+    aux->id2tr    = kh_init(int2tscript);   // transcript id to tscript_t
+    gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
+    aux->ignored_biotypes = khash_str2int_init();
+    gff_id_init(&aux->gene_ids);
+    gff_id_init(&gff->tscript_ids);
+
+    // parse gff
+    kstring_t str = {0,0,0};
+    htsFile *fp = hts_open(gff->fname,"r");
+    if ( !fp ) error("Failed to read %s\n", gff->fname);
+    // hts_getline returns the line length, which is 0 for a blank line; only negative
+    // values mean end-of-file (-1) or an error (less than -1). Blank lines are skipped by
+    // gff_parse_line and must not terminate the loop
+    int nread;
+    while ( (nread = hts_getline(fp, KS_SEP_LINE, &str)) >= 0 )
+    {
+        // parse into the next free slot, the slot is kept only for exons, CDS and UTRs
+        hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+        int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr);
+        if ( !ret ) aux->nftr++;
+    }
+    if ( nread < -1 ) error("Failed to read %s\n", gff->fname);
+    free(str.s);
+    if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname);
+
+
+    // process gff information: connect CDS and exons to transcripts
+    gff->idx_cds  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+    gff->idx_utr  = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+    gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+
+    int i;
+    for (i=0; i<aux->nftr; i++)
+    {
+        ftr_t *ftr = &aux->ftr[i];
+
+        // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+        khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+        if ( k==kh_end(aux->id2tr) ) continue;       // no corresponding transcript registered, must be an unsupported biotype
+
+        gf_tscript_t *tr = kh_val(aux->id2tr,k);
+        tr->used = 1;
+        tr->gene->used = 1;
+
+        // populate regidx by category:
+        //      ftr->type   .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+        //      gene->type  .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+        if ( ftr->type==GF_CDS ) register_cds(gff, ftr);
+        else if ( ftr->type==GF_EXON ) register_exon(gff, ftr);
+        else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
+        else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
+        else
+            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+    }
+    tscript_init_cds(gff);
+
+    if ( gff->verbosity > 0 )
+    {
+        fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+                regidx_nregs(gff->idx_tscript),
+                regidx_nregs(gff->idx_exon),
+                regidx_nregs(gff->idx_cds),
+                regidx_nregs(gff->idx_utr));
+    }
+
+    if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
+    {
+        khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+        fprintf(stderr,"Ignored the following biotypes:\n");
+        for (i = kh_begin(ign); i < kh_end(ign); i++)
+        {
+            if ( !kh_exist(ign,i)) continue;
+            const char *biotype = kh_key(ign,i);
+            if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
+            fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
+        }
+    }
+    khash_str2int_destroy_free(aux->ignored_biotypes);
+
+    // warn about unprinted warnings: with verbosity 1 only the first warning of each kind
+    // was printed, all the others are counted here
+    if ( gff->verbosity > 0 )
+    {
+        int nwarn = 0;
+        #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1;
+        INC_NWARN(unknown_chr);
+        INC_NWARN(unknown_tscript_biotype);
+        INC_NWARN(unknown_strand);
+        INC_NWARN(unknown_phase);
+        INC_NWARN(duplicate_id);
+        INC_NWARN(unknown_cds_phase);
+        INC_NWARN(incomplete_cds);
+        INC_NWARN(wrong_phase);
+        INC_NWARN(overlapping_cds);
+        if ( nwarn > 0 )
+            fprintf(stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+    }
+
+    // the dump needs aux->seq and aux->gene_ids, do it before they are released below
+    if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname);
+
+    if ( !regidx_nregs(gff->idx_tscript) )
+        error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+              " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+              " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
+
+    free(aux->seq);
+    free(aux->ftr);
+    khash_str2int_destroy_free(aux->seq2int);
+    // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+    kh_destroy(int2tscript,aux->id2tr);
+    gff_id_destroy(&aux->gene_ids);
+
+    return 0;
+}
+
+// Allocate a zero-initialized gff_t for the GFF file fname. The file name pointer is
+// stored, not copied. Returns NULL when the allocation fails.
+gff_t *gff_init(const char *fname)
+{
+    gff_t *gff = calloc(1,sizeof(gff_t));
+    if ( !gff ) return NULL;    // do not dereference a failed allocation
+    gff->fname = fname;
+    return gff;
+}
+// Release everything owned by gff: the gene records and their hash, the four regidx
+// indexes (their payloads are freed by the destructors registered with the index), the
+// transcript id table and the structure itself
+void gff_destroy(gff_t *gff)
+{
+    khash_t(int2gene) *genes = gff->init.gid2gene;
+    if ( genes )
+    {
+        khint_t k = 0;
+        while ( k < kh_end(genes) )
+        {
+            if ( kh_exist(genes, k) )
+            {
+                gf_gene_t *gene = (gf_gene_t*) kh_val(genes, k);
+                free(gene->name);
+                free(gene);
+            }
+            k++;
+        }
+        kh_destroy(int2gene,genes);
+    }
+
+    regidx_destroy(gff->idx_cds);
+    regidx_destroy(gff->idx_utr);
+    regidx_destroy(gff->idx_exon);
+    regidx_destroy(gff->idx_tscript);
+
+    gff_id_destroy(&gff->tscript_ids);
+    free(gff);
+}
+
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2023 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "gff.h"
+
+/*
+ Helper structures, only for initialization
+
+ ftr_t
+ temporary list of all exons, CDS, UTRs
+*/
+// Integer-keyed hash maps: numeric transcript id -> gf_tscript_t*, numeric gene id -> gf_gene_t*
+KHASH_MAP_INIT_INT(int2tscript, gf_tscript_t*)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+// One parsed exon, CDS or UTR record, see aux_t.ftr
+typedef struct
+{
+ int type; // GF_CDS, GF_EXON, GF_UTR5, GF_UTR3
+ uint32_t beg; // start coordinate
+ uint32_t end; // end coordinate
+ uint32_t trid; // numeric id of the parent transcript
+ // NOTE(review): the three bit-fields below are unsigned; a value of -1 cannot be stored
+ // in them and any comparison of these fields with -1 is always false - confirm the
+ // parser does not rely on such a sentinel
+ uint32_t strand:1; // STRAND_REV,STRAND_FWD
+ uint32_t phase:2; // 0, 1, 2, or 3 for unknown
+ uint32_t iseq:29; // index of the sequence name
+}
+ftr_t;
+
+/*
+ Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+ to integer id. To keep the memory requirements low, the original version
+ relied on IDs in the form of a string prefix and a numerical id. However,
+ it turns out that this assumption is not valid for some ensembl GFFs, see
+ for example Zea_mays.AGPv4.36.gff3.gz
+ */
+// Two-way mapping between ID strings and consecutive numeric ids
+typedef struct
+{
+ void *str2id; // khash_str2int: ID string to numeric id
+ int nstr, mstr; // number of registered strings and the allocated size of str
+ char **str; // numeric id to string
+}
+id_tbl_t;
+
+// Working data used while the GFF file is being parsed
+typedef struct
+{
+ // all exons, CDS, UTRs
+ ftr_t *ftr;
+ int nftr, mftr; // number of used and allocated elements of ftr
+
+ // mapping from gene id to gf_gene_t
+ kh_int2gene_t *gid2gene;
+
+ // mapping from transcript id to tscript, for quick CDS anchoring
+ kh_int2tscript_t *id2tr;
+
+ // sequences
+ void *seq2int; // str2int hash
+ char **seq; // sequence names, indexed by the numeric sequence id
+ int nseq, mseq; // number of used and allocated elements of seq
+
+ // ignored biotypes
+ void *ignored_biotypes;
+
+ id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
+
+ // pointers to the current partially processed line
+ char *id, *id_end, *parent, *parent_end, *biotype, *biotype_end,
+ *chr, *chr_end, *name, *name_end, *type, *type_end;
+}
+aux_t;
+
+// The GFF reader: input settings, the resulting lookup indexes and warning counters
+struct gff_t_
+{
+ const char *fname, *dump_fname; // dump_fname is set via gff_set(), see the dump_fname key
+
+ // the main regidx lookups, from chr:beg-end to overlapping features and
+ // index iterator
+ regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+
+ // temporary structures, deleted after initialization
+ aux_t init;
+
+ // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
+ id_tbl_t tscript_ids;
+
+ int strip_chr_names, verbosity;
+ int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
+
+ // warning counters, one per kind of warning
+ struct {
+ int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id;
+ int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds;
+ } warned;
+};
+
+// Feature type names returned by gf_type2gff_string(). Each table is indexed by the masked
+// type value minus one, so the order of the entries is significant.
+// NOTE(review): "ribozyme" is listed twice, and "Trna_pseudogene" and "known_ncRNA" differ
+// in case from the "tRNA_pseudogene" and "known_ncrna" spellings matched by
+// gff_parse_biotype - confirm against the GF_* constants in gff.h
+static const char *gf_strings_noncoding[] =
+{
+ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+ "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
+ "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+ "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf",
+ "lncRNA"
+};
+// names of the types for which GF_is_coding() is true
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+// names of the remaining (non-biotype) types
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+// Set an option of the reader. The single variadic argument is a char* for dump_fname
+// and an int for force_out_of_phase, strip_chr_names and verbosity. Any other key is
+// reported via error(). Returns 0.
+int gff_set(gff_t *gff, gff_opt_t key, ...)
+{
+    va_list args;
+    if ( key==dump_fname )
+    {
+        va_start(args, key);
+        gff->dump_fname = va_arg(args,char*);
+        va_end(args);
+    }
+    else if ( key==force_out_of_phase )
+    {
+        va_start(args, key);
+        gff->force = va_arg(args,int);
+        va_end(args);
+    }
+    else if ( key==strip_chr_names )
+    {
+        va_start(args, key);
+        gff->strip_chr_names = va_arg(args,int);
+        va_end(args);
+    }
+    else if ( key==verbosity )
+    {
+        va_start(args, key);
+        gff->verbosity = va_arg(args,int);
+        va_end(args);
+    }
+    else
+        error("The key %d is not supported with gff_set\n",key);
+
+    return 0;
+}
+
+// Return the regidx index selected by key (idx_cds, idx_utr, idx_exon or idx_tscript).
+// Any other key is reported via error().
+void *gff_get(gff_t *gff, gff_opt_t key)
+{
+    if ( key==idx_cds ) return gff->idx_cds;
+    if ( key==idx_utr ) return gff->idx_utr;
+    if ( key==idx_exon ) return gff->idx_exon;
+    if ( key==idx_tscript ) return gff->idx_tscript;
+
+    error("The key %d is not supported with gff_get\n",key);
+    return NULL;
+}
+
+// Translate a numeric id back to the ID string. The type argument is not consulted,
+// the transcript id table is always used.
+const char *gff_id2string(gff_t *gff, id_type_t type, int id) // currently only transcript ids
+{
+    const id_tbl_t *tbl = &gff->tscript_ids;
+    return tbl->str[id];
+}
+
+// Translate a feature type to its name. Coding types are looked up in gf_strings_coding
+// by the bits below GF_coding_bit, non-coding types smaller than 1<<GF_coding_bit in
+// gf_strings_noncoding, and the remaining types in gf_strings_special.
+const char *gf_type2gff_string(int type)
+{
+    if ( GF_is_coding(type) )
+    {
+        int idx = type & ((1<<GF_coding_bit) - 1);
+        return gf_strings_coding[idx - 1];
+    }
+    if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+
+    int idx = type & ((1<<(GF_coding_bit+1)) - 1);
+    return gf_strings_special[idx - 1];
+}
+
+/*
+ gff parsing functions
+*/
+// Return the numeric index of the sequence whose name spans chr_beg..chr_end (chr_end is
+// the last character). An unseen name is copied and added to aux->seq and aux->seq2int.
+// The name is NUL-terminated in place for the lookup and the overwritten byte is restored
+// before returning.
+static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
+{
+    aux_t *aux = &gff->init;
+    char *term = chr_end + 1;
+    char saved = *term;
+    *term = 0;
+
+    int iseq;
+    if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)==0 )
+    {
+        // known sequence
+        *term = saved;
+        return iseq;
+    }
+
+    // new sequence
+    char *copy = strdup(chr_beg);
+    hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+    aux->seq[aux->nseq] = copy;
+    iseq = khash_str2int_inc(aux->seq2int, copy);
+    aux->nseq++;
+    assert( aux->nseq < 1<<29 );   // see gf_gene_t.iseq and ftr_t.iseq
+
+    *term = saved;
+    return iseq;
+}
+// Return a pointer to the character following the first tab at or after ss. A line
+// with no further tab is reported via error().
+static inline char *gff_skip(const char *line, char *ss)
+{
+    for ( ; *ss; ss++ )
+        if ( *ss=='\t' ) return ss+1;
+
+    error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return ss+1;
+}
+// Locate the first column of the line. On return chr_beg points to the first character of
+// the sequence name (past a leading case-insensitive "chr" when gff->strip_chr_names is
+// set) and chr_end to the character preceding the first tab. A line without a tab is
+// reported via error().
+static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, char **chr_end)
+{
+    char *tab = (char*) line;
+    for ( ; *tab; tab++ )
+        if ( *tab=='\t' ) break;
+    if ( *tab==0 ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    const char *name = line;
+    if ( gff->strip_chr_names && strncasecmp("chr",line,3)==0 ) name = line + 3;
+
+    *chr_beg = (char*) name;
+    *chr_end = tab - 1;
+}
+// Parse the two consecutive numeric columns starting at ss (begin and end coordinate).
+// The parsed values minus one are stored in beg and end. Returns a pointer to the start
+// of the column following the end coordinate. A missing number, or a line that ends
+// directly after either number, is reported via error().
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+    char *se = ss;
+    *beg = strtol(ss, &se, 10) - 1;
+    // !*se: the line ends after the number, se+1 would point past the terminating NUL
+    if ( ss==se || !*se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+    ss = se+1;
+    *end = strtol(ss, &se, 10) - 1;
+    if ( ss==se || !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    return se+1;
+}
+// Reset the table to an empty state and create its string-to-id hash
+static void gff_id_init(id_tbl_t *tbl)
+{
+    tbl->nstr = 0;
+    tbl->mstr = 0;
+    tbl->str  = NULL;
+    tbl->str2id = khash_str2int_init();
+}
+// Release the table: the id-to-string array and the string-to-id hash
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+    free(tbl->str);
+    khash_str2int_destroy_free(tbl->str2id);
+}
+// Look up the string spanning beg..end (both inclusive) in the id table and store
+// its numeric id in *id_ptr. An unknown string is copied and assigned the next
+// free id. Always returns 0.
+static inline int gff_id_register(id_tbl_t *tbl, char *beg, char *end, uint32_t *id_ptr)
+{
+    int idx;
+
+    // NUL-terminate the string in place while it is used as a hash key
+    const char saved = end[1];
+    end[1] = 0;
+
+    if ( khash_str2int_get(tbl->str2id, beg, &idx) < 0 )
+    {
+        idx = tbl->nstr;
+        tbl->nstr++;
+        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+        char *copy = strdup(beg);
+        tbl->str[idx] = copy;
+        khash_str2int_set(tbl->str2id, copy, idx);
+    }
+
+    end[1] = saved;
+    *id_ptr = idx;
+    return 0;
+}
+// Translate a biotype (or 3rd-column feature type) string into a GF_* code.
+//
+// The comparison is a prefix match on the known names, so the string does not
+// have to be NUL-terminated right after the biotype (e.g. "lncRNA;ID=.." works).
+// The order of the tests within each case matters: longer names must be tested
+// before the shorter names they start with.
+//
+// Returns -1 when line is NULL, 0 when the biotype is not recognised, and the
+// matching GF_* constant otherwise.
+static inline int gff_parse_biotype(char *line)
+{
+    if ( !line ) return -1;
+    switch (*line)
+    {
+        case 'p':
+            if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+            else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+            else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+            else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+            break;
+        case 'a':
+            if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+            else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+            else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+            break;
+        case 'I':
+            if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+            else if ( !strncmp(line,"IG_C",4) ) return GF_IG_C;
+            else if ( !strncmp(line,"IG_D",4) ) return GF_IG_D;
+            else if ( !strncmp(line,"IG_J",4) ) return GF_IG_J;
+            else if ( !strncmp(line,"IG_V",4) ) return GF_IG_V;
+            else if ( !strncmp(line,"IG_LV",5) ) return GF_IG_LV;
+            break;
+        case 'T':
+            if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+            else if ( !strncmp(line,"TR_C",4) ) return GF_TR_C;
+            else if ( !strncmp(line,"TR_D",4) ) return GF_TR_D;
+            else if ( !strncmp(line,"TR_J",4) ) return GF_TR_J;
+            else if ( !strncmp(line,"TR_V",4) ) return GF_TR_V;
+            break;
+        case 'M':
+            if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+            else if ( !strncasecmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+            // fix: Mt_rRNA used to be reported as GF_MT_tRNA
+            else if ( !strncasecmp(line,"Mt_rRNA",7) ) return GF_MT_rRNA;
+            else if ( !strncasecmp(line,"MRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'l':
+            if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+            // fix: compare 6 characters, not 7. With 7 the terminating NUL was part of the
+            // comparison and "biotype=lncRNA;..." could never match
+            else if ( !strncmp(line,"lncRNA",6) ) return GF_lncRNA;
+            break;
+        case 'm':
+            if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+            else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+            else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+            else if ( !strncasecmp(line,"mRNA",4) ) return GF_PROTEIN_CODING;
+            break;
+        case 'r':
+            if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+            else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+            else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+            else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+            break;
+        case 's':
+            if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+            else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+            else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+            else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+            else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+            else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+            else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+            break;
+        case 't':
+            if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+            else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+            break;
+        case 'n':
+            if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+            else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+            break;
+        case 'N':
+            if ( !strncmp(line,"NMD",3) ) return GF_NMD;
+            break;
+        case 'k':
+            if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+            break;
+        case 'u':
+            if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+            else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+            break;
+        case 'L':
+            if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+            break;
+        case '3':
+            if ( !strncasecmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            else if ( !strncasecmp(line,"3_prime_overlapping_ncRNA",25) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+            break;
+        case 'd':
+            if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+            break;
+        case 'v':
+            if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+            break;
+        case 'b':
+            if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+            break;
+    }
+    return 0;
+}
+// Count one more occurrence of an unsupported biotype, given as the span ss..se
+// (both inclusive), in gff->init.ignored_biotypes. Returns 0 when no biotype was
+// given (ss is NULL) and 1 after the counter has been updated.
+static inline int gff_ignored_biotype(gff_t *gff, char *ss, char *se)
+{
+    if ( ss==NULL ) return 0;
+
+    // NUL-terminate the biotype in place for the hash operations
+    const char saved = se[1];
+    se[1] = 0;
+
+    int count = 0;
+    int known = khash_str2int_get(gff->init.ignored_biotypes, ss, &count)==0;
+
+    // a new key must live on the heap, a known one only serves as the lookup string
+    char *key = known ? ss : strdup(ss);
+    khash_str2int_set(gff->init.ignored_biotypes, key, count+1);
+
+    se[1] = saved;
+    return 1;
+}
+// Return the gene record stored under gene_id in aux->gid2gene. When there is
+// none yet, a zero-initialised record is allocated, stored and returned.
+static gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+    khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+    if ( k != kh_end(aux->gid2gene) && kh_val(aux->gid2gene, k) )
+        return kh_val(aux->gid2gene, k);
+
+    // not registered yet
+    gf_gene_t *fresh = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+    int ret;
+    k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+    kh_val(aux->gid2gene,k) = fresh;
+    return fresh;
+}
+// Register a transcript line. The transcript type is taken from the biotype
+// attribute or, failing that, from the 3rd-column feature type. Lines with an
+// unsupported type are counted or warned about and otherwise ignored. For the
+// rest a gf_tscript_t is allocated, linked to its parent gene and stored in
+// aux->id2tr under the transcript id.
+static void gff_parse_transcript(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+
+    ftr->type = gff_parse_biotype(aux->biotype);
+    if ( ftr->type <= 0 )
+    {
+        // no usable biotype attribute, try the feature type column instead
+        char *last = aux->type_end;
+        const char saved = last[1];
+        last[1] = 0;
+        ftr->type = gff_parse_biotype(aux->type);
+        last[1] = saved;
+    }
+    if ( ftr->type <= 0 )
+    {
+        // unsupported: either counted among the ignored biotypes or, if there was
+        // no biotype attribute at all, reported as a warning
+        int counted = gff_ignored_biotype(gff,aux->biotype,aux->biotype_end);
+        if ( counted || gff->verbosity <= 0 ) return;
+        if ( gff->verbosity > 1 || !gff->warned.unknown_tscript_biotype )
+            fprintf(bcftools_stderr,"Warning: Ignoring transcript with unknown biotype .. %s\n", line);
+        gff->warned.unknown_tscript_biotype++;
+        return;
+    }
+
+    if ( aux->id==NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+    if ( aux->parent==NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // string ids -> numeric ids
+    uint32_t trid, gene_id;
+    gff_id_register(&gff->tscript_ids, aux->id, aux->id_end, &trid);
+    gff_id_register(&aux->gene_ids, aux->parent, aux->parent_end, &gene_id);
+
+    gf_tscript_t *tscript = (gf_tscript_t*) calloc(1,sizeof(gf_tscript_t));
+    tscript->id     = trid;
+    tscript->strand = ftr->strand;
+    tscript->gene   = gene_init(aux, gene_id);
+    tscript->type   = ftr->type;
+    tscript->beg    = ftr->beg;
+    tscript->end    = ftr->end;
+
+    int ret;
+    khint_t k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+    kh_val(aux->id2tr,k) = tscript;
+}
+// Register an exon, CDS or UTR line: resolve the parent transcript id into
+// ftr->trid and the sequence name into ftr->iseq. An unknown strand or phase is
+// only warned about here, the feature is still passed on.
+static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( aux->parent==NULL )
+        error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring found: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+
+    // the Parent attribute names the transcript this feature belongs to
+    gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
+
+    const int verbose = gff->verbosity > 0;
+    const int print_all = gff->verbosity > 1;
+
+    if ( verbose && ftr->strand==-1 )
+    {
+        if ( print_all || !gff->warned.unknown_strand )
+            fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
+        gff->warned.unknown_strand++;
+    }
+    if ( verbose && ftr->phase==-1 )
+    {
+        if ( print_all || !gff->warned.unknown_phase )
+            fprintf(bcftools_stderr,"Warning: Ignoring GFF feature with unknown phase .. %s\n",line);
+        gff->warned.unknown_phase++;
+    }
+
+    ftr->iseq = feature_set_seq(gff, aux->chr,aux->chr_end);
+}
+// Register a gene line (gff_parse_line also routes features without a Parent
+// attribute here). Lines without an ID are silently skipped. The first record
+// seen for an ID wins: it sets the sequence, coordinates, strand and name of the
+// gene. Later records with the same ID only trigger the duplicate-id warning.
+//
+// The name is taken from the Name= attribute or, when absent, from the gene ID.
+static void gff_parse_gene(gff_t *gff, const char *line, ftr_t *ftr)
+{
+    aux_t *aux = &gff->init;
+    if ( !aux->id ) return;
+
+    uint32_t gene_id;
+    gff_id_register(&aux->gene_ids, aux->id, aux->id_end, &gene_id);
+
+    gf_gene_t *gene = gene_init(aux, gene_id);
+    if ( gene->name )
+    {
+        // The name is only set below, so a non-NULL name means this ID was already
+        // registered. Respect verbosity==0 like every other warning in this file.
+        if ( gff->verbosity > 0 )
+        {
+            if ( !gff->warned.duplicate_id || gff->verbosity > 1 )
+                fprintf(bcftools_stderr,"Warning: The GFF contains features with duplicate id .. %s\n",line);
+            gff->warned.duplicate_id++;
+        }
+        return;
+    }
+
+    gene->iseq = feature_set_seq(gff, aux->chr,aux->chr_end);
+    gene->beg = ftr->beg;
+    gene->end = ftr->end;
+    gene->strand = ftr->strand;
+    gene->id = gene_id;
+
+    if ( aux->name )
+    {
+        // name..name_end is an inclusive span inside the line, copy and NUL-terminate it
+        size_t name_len = aux->name_end - aux->name + 1;
+        gene->name = (char*) malloc(name_len + 1);
+        memcpy(gene->name, aux->name, name_len);
+        gene->name[name_len] = 0;
+    }
+    else
+        gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
+}
+
+// Parse one GFF line. Returns 0 for exons,CDS,UTRs to indicate these need to be pruned later and regidx built on them,
+// or -1 to indicate the structure needs not be saved (a skipped line, an unsupported transcript, or a feature already saved
+// as transcript or gene.) The pointers stored in gff->init (chr, type, id, parent, name, biotype) point into `line`.
+static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
+{
+ // - skip empty lines and commented lines
+ // - columns
+ // 1. chr
+ // 2. <skip>
+ // 3. CDS, transcript, gene, ...
+ // 4-5. beg,end
+ // 6. <skip>
+ // 7. strand
+ // 8. phase
+ // 9. Parent=transcript:ENST(\d+);ID=...;biotype=... etc
+
+ char *ss = line;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ aux_t *aux = &gff->init;
+ gff_parse_chr(gff, line, &aux->chr, &aux->chr_end);
+ ss = gff_skip(line, aux->chr_end + 2); // chr_end+1 is the tab after column 1, chr_end+2 the start of column 2, which is skipped
+
+ // 3rd column: is this a CDS, transcript, gene, etc.. The parsing order by frequency in Homo_sapiens.GRCh37.87.gff3
+ int is_gene_line = 0;
+ ftr->type = 0;
+ aux->type = ss; // start of the 3rd column, used by gff_parse_transcript as a fallback biotype
+ if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+ else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+ else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+ else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+ else if ( !strncmp("biological_region\t",ss,18) ) { return -1; } // skip
+ else if ( !strncmp("gene\t",ss,5) ) { is_gene_line = 1; ss += 5; }
+ else ss = gff_skip(line, ss); // any other type: ftr->type stays 0 and the line is later treated as a transcript (or gene)
+ aux->type_end = ss - 1; // ss is at the start of column 4, so this is the tab that ends column 3
+
+ // 4-5th columns: beg,end
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+
+ // 6th column: skip
+ ss = gff_skip(line, ss);
+
+ // 7th column: strand
+ ftr->strand = -1;
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ ss += 2; // one strand character plus the tab; NOTE(review): not checked, assumes a well-formed single-character column
+
+ // 8th column: phase (codon offset)
+ ftr->phase = -1;
+ if ( *ss == '0' ) ftr->phase = 0;
+ else if ( *ss == '1' ) ftr->phase = 1;
+ else if ( *ss == '2' ) ftr->phase = 2;
+ else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase
+ ss += 2; // one phase character plus the tab, same unchecked assumption as for the strand
+
+ // 9th column: id, parent, name, biotype
+ aux->name = NULL, aux->id = NULL, aux->parent = NULL, aux->biotype = NULL;
+ while ( *ss )
+ {
+ char *es = ss;
+ while ( *es && *es!=';' ) es++; // es: the ';' ending this key=value pair, or the end of the line
+ if ( !strncmp(ss,"ID=",3) )
+ {
+ ss += 3;
+ aux->id_end = es - 1;
+ aux->id = ss;
+ if ( !strncmp(ss,"gene:",5) ) { aux->id += 5; is_gene_line = 1; } // an "ID=gene:" prefix marks a gene line regardless of column 3
+ else if ( !strncmp(ss,"transcript:",11) ) aux->id += 11;
+ }
+ else if ( !strncmp(ss,"Name=",5) ) { aux->name = ss + 5; aux->name_end = es - 1; }
+ else if ( !strncmp(ss,"Parent=",7) )
+ {
+ ss += 7;
+ aux->parent_end = es - 1;
+ aux->parent = ss;
+ if ( !strncmp(ss,"gene:",5) ) aux->parent += 5;
+ else if ( !strncmp(ss,"transcript:",11) ) aux->parent += 11;
+ }
+ else if ( !strncmp(ss,"biotype=",8) ) { aux->biotype = ss + 8; aux->biotype_end = es - 1; }
+ else if ( !strncmp(ss,"gene_biotype=",13) ) { aux->biotype = ss + 13; aux->biotype_end = es - 1; }
+ if ( !*es ) break;
+ ss = es + 1;
+ }
+
+ if ( is_gene_line || !aux->parent ) // features without a Parent attribute are handled as genes as well
+ {
+ gff_parse_gene(gff, line, ftr);
+ return -1;
+ }
+
+ if ( ftr->type ) // non-zero only for exon, CDS and UTR lines, see the 3rd column parsing above
+ {
+ gff_parse_exon(gff, line, ftr);
+ return 0;
+ }
+
+ gff_parse_transcript(gff, line, ftr);
+ return -1;
+}
+
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+    // qsort callback for an array of gf_cds_t pointers: ascending by start coordinate
+    const gf_cds_t *lhs = *((gf_cds_t**)a);
+    const gf_cds_t *rhs = *((gf_cds_t**)b);
+    return (lhs->beg > rhs->beg) - (lhs->beg < rhs->beg);
+}
+
+// Return the stored name of sequence iseq as an inclusive span: *chr_beg points
+// to its first and *chr_end to its last character.
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+    char *name = aux->seq[iseq];
+    char *last = name;
+    while ( last[1] ) last++;
+    *chr_beg = name;
+    *chr_end = last;
+}
+// Fetch the transcript registered under trid in aux->id2tr. The transcript must
+// exist, which is enforced with an assert.
+static gf_tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+    gf_tscript_t *tscript = NULL;
+    khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+    if ( k != kh_end(aux->id2tr) ) tscript = kh_val(aux->id2tr, k);
+    assert( tscript );
+    return tscript;
+}
+static void register_cds(gff_t *gff, ftr_t *ftr)
+{
+    // Append the CDS described by ftr (a parsed gff CDS line) to the CDS list of
+    // its transcript. The CDS is not inserted into idx_cds here, this is done
+    // later in tscript_init_cds().
+    gf_tscript_t *tscript = tscript_init(&gff->init, ftr->trid);
+    if ( tscript->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tscript->strand,ftr->strand);
+
+    gf_cds_t *rec = (gf_cds_t*) malloc(sizeof(gf_cds_t));
+    rec->tr    = tscript;
+    rec->beg   = ftr->beg;
+    rec->len   = ftr->end - ftr->beg + 1;
+    rec->icds  = 0;     // to keep valgrind on mac happy
+    rec->phase = ftr->phase;
+
+    hts_expand(gf_cds_t*,tscript->ncds+1,tscript->mcds,tscript->cds);
+    tscript->cds[tscript->ncds] = rec;
+    tscript->ncds++;
+}
+static void register_utr(gff_t *gff, ftr_t *ftr)
+{
+    // Allocate a UTR record for the parsed feature and insert it into idx_utr
+    // under the sequence of the gene its transcript belongs to
+    gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+    if ( ftr->type==GF_UTR3 )
+        utr->which = prime3;
+    else
+        utr->which = prime5;
+    utr->beg = ftr->beg;
+    utr->end = ftr->end;
+    utr->tr  = tscript_init(&gff->init, ftr->trid);
+
+    char *name_beg, *name_end;
+    chr_beg_end(&gff->init, utr->tr->gene->iseq, &name_beg, &name_end);
+    regidx_push(gff->idx_utr, name_beg,name_end, utr->beg,utr->end, &utr);
+}
+static void register_exon(gff_t *gff, ftr_t *ftr)
+{
+    // Allocate an exon record for the parsed feature and insert it into idx_exon.
+    // The indexed interval is widened by N_SPLICE_REGION_INTRON on both sides,
+    // the record itself keeps the exon coordinates.
+    gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+    exon->end = ftr->end;
+    exon->beg = ftr->beg;
+    exon->tr  = tscript_init(&gff->init, ftr->trid);
+
+    char *name_beg, *name_end;
+    chr_beg_end(&gff->init, exon->tr->gene->iseq, &name_beg, &name_end);
+    regidx_push(gff->idx_exon, name_beg,name_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+static void tscript_init_cds(gff_t *gff)
+{   /* For every transcript in aux->id2tr: insert it into idx_tscript, then sort its CDS by position,
+       apply the phase of the first coding CDS, verify the phase of the others, trim the coding length
+       to a multiple of three and insert the CDS into idx_cds. Transcripts failing the phase check
+       (with gff->force set) stay in idx_tscript but their CDS are not inserted into idx_cds. */
+ aux_t *aux = &gff->init;
+
+ // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+ khint_t k;
+ for (k=0; k<kh_end(aux->id2tr); k++)
+ {
+ if ( !kh_exist(aux->id2tr, k) ) continue;
+ gf_tscript_t *tr = (gf_tscript_t*) kh_val(aux->id2tr, k);
+
+ // position-to-tscript lookup
+ char *chr_beg, *chr_end;
+ chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(gff->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+ if ( !tr->ncds ) continue; // transcript with no CDS
+
+ // sort CDs
+ qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+ // trim non-coding start
+ int i, len = 0; // len: coding length accumulated so far, compared against each CDS phase below
+ if ( tr->strand==STRAND_FWD )
+ {
+ if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
+ {
+ if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+ tr->cds[0]->beg += tr->cds[0]->phase; // drop the leading bases that belong to an incomplete codon
+ tr->cds[0]->len -= tr->cds[0]->phase;
+ tr->cds[0]->phase = 0;
+ }
+
+ // sanity check phase; the phase number in gff tells us how many bases to skip in this
+ // feature to reach the first base of the next codon
+ int tscript_ok = 1;
+ for (i=0; i<tr->ncds; i++)
+ {
+ if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+ {
+ if ( gff->verbosity > 0 )
+ {
+ if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+ gff->warned.unknown_cds_phase++;
+ }
+ len += tr->cds[i]->len;
+ continue;
+ }
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; // bases of the current codon already seen, expected to equal len%3
+ if ( phase!=len%3 )
+ {
+ if ( !gff->force )
+ error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+ gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ if ( gff->verbosity > 0 )
+ {
+ if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+ gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ gff->warned.wrong_phase++;
+ }
+ tscript_ok = 0;
+ break;
+ }
+ len += tr->cds[i]->len;
+ }
+ if ( !tscript_ok ) continue; // skip this transcript
+ }
+ else
+ {
+ if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) // on the reverse strand the last sorted CDS is the first one translated
+ {
+ // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+ // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141.
+ // This also fixes phase of 5' incomplete CDS, see test/csq/ENST00000520868/ENST00000520868.gff
+ // todo: the same for the fwd strand
+ i = tr->ncds - 1;
+ int phase = tr->cds[i]->phase;
+ if ( phase ) tr->trim |= TRIM_5PRIME;
+ while ( i>=0 && phase > tr->cds[i]->len )
+ {
+ phase -= tr->cds[i]->len;
+ tr->cds[i]->phase = 0;
+ tr->cds[i]->len = 0;
+ i--;
+ }
+ if ( gff->verbosity > 0 && tr->cds[i]->phase ) // NOTE(review): i is -1 here if the loop above consumed every CDS -- confirm this cannot happen
+ {
+ if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+ gff->warned.incomplete_cds++;
+ }
+ tr->cds[i]->len -= tr->cds[i]->phase; // only len is reduced, beg stays: the trimmed bases are at the high-coordinate end
+ tr->cds[i]->phase = 0;
+ }
+
+ // sanity check phase
+ int tscript_ok = 1;
+ for (i=tr->ncds-1; i>=0; i--)
+ {
+ if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+ {
+ if ( gff->verbosity > 0 )
+ {
+ if ( !gff->warned.unknown_cds_phase || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Warning: CDS with unknown phase, could not verify reading frame in transcript %s\n",gff->tscript_ids.str[tr->id]);
+ gff->warned.unknown_cds_phase++;
+ }
+ len += tr->cds[i]->len;
+ continue;
+ }
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; // same conversion as on the forward strand
+ if ( phase!=len%3 )
+ {
+ if ( !gff->force )
+ error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+ gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ if ( gff->verbosity > 0 )
+ {
+ if ( !gff->warned.wrong_phase || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Warning: The GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+ gff->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ gff->warned.wrong_phase++;
+ }
+ tscript_ok = 0;
+ break;
+ }
+ len += tr->cds[i]->len;
+ }
+ if ( !tscript_ok ) continue; // skip this transcript
+ }
+
+ // set len. At the same check that CDS within a transcript do not overlap
+ len = 0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->icds = i;
+ len += tr->cds[i]->len;
+ if ( !i ) continue;
+
+ gf_cds_t *a = tr->cds[i-1];
+ gf_cds_t *b = tr->cds[i];
+ if ( a->beg + a->len - 1 >= b->beg ) // last base of the previous CDS reaches into the current one; only warned about
+ {
+ if ( gff->verbosity > 0 )
+ {
+ if ( !gff->warned.overlapping_cds || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s, %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32" (ribosomal slippage?)\n",
+ gff->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ gff->warned.overlapping_cds++;
+ }
+ }
+ }
+
+ if ( len%3 != 0 )
+ {
+ // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+ // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+ // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+ if ( gff->verbosity > 0 )
+ {
+ if ( !gff->warned.incomplete_cds || gff->verbosity > 1 )
+ fprintf(bcftools_stderr,"Note: truncated transcript %s with incomplete CDS (this is very common)\n",gff->tscript_ids.str[tr->id]);
+ gff->warned.incomplete_cds++;
+ }
+
+ tr->trim |= TRIM_3PRIME;
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = tr->ncds - 1;
+ while ( i>=0 && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len; // remove the excess, or the whole CDS if it is shorter than the excess
+ tr->cds[i]->len -= dlen;
+ len -= dlen;
+ i--;
+ }
+ }
+ else
+ {
+ i = 0;
+ while ( i<tr->ncds && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ tr->cds[i]->beg += dlen; // reverse strand: the trimmed bases are at the low-coordinate end
+ len -= dlen;
+ i++;
+ }
+ }
+ }
+
+ // set CDS offsets and insert into regidx
+ len=0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->pos = len; // summed length of the preceding CDS (in coordinate order)
+ len += tr->cds[i]->len;
+ regidx_push(gff->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+ }
+ }
+}
+
+// regidx payload destructor used for idx_cds, idx_utr and idx_exon: the payload
+// slot holds a pointer to a heap-allocated record, which is freed here.
+static void regidx_free_gf(void *payload)
+{
+    gf_cds_t **slot = (gf_cds_t**) payload;
+    free(*slot);
+}
+// regidx payload destructor used for idx_tscript: frees the transcript's array
+// of CDS pointers (not the CDS records themselves) and then the transcript.
+static void regidx_free_tscript(void *payload)
+{
+    gf_tscript_t **slot = (gf_tscript_t**) payload;
+    gf_tscript_t *tscript = *slot;
+    free(tscript->cds);
+    free(tscript);
+}
+
+static int gff_dump(gff_t *gff, const char *fname)
+{   /* Write the parsed annotation to fname in GFF format: first all genes, then the transcripts,
+       CDS, UTRs and exons from the respective indexes. Any I/O failure is reported through error().
+       Always returns 0. */
+ BGZF *out = bgzf_open(fname,"wg");
+ if ( !out ) error("Failed to open %s: %s\n", fname, strerror(errno));
+
+ kstring_t str = {0,0,0}; // line buffer reused for every record (str.l is reset before each ksprintf)
+
+ khint_t k;
+ for (k=0; k<kh_end(gff->init.gid2gene); k++)
+ {
+ if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+ gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+ char *gene_id = gff->init.gene_ids.str[gene->id]; // NOTE(review): gff_parse calls gff_id_destroy(&aux->gene_ids) at its end, so this must run before that
+ str.l = 0;
+ ksprintf(&str,"%s\t.\tgene\t%d\t%d\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used); // +1: coordinates were decremented by one when parsed
+ if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+ }
+
+ regitr_t *itr = regitr_init(gff->idx_tscript);
+ while ( regitr_loop(itr) )
+ {
+ gf_tscript_t *tr = regitr_payload(itr, gf_tscript_t*);
+ char *gene_id = gff->init.gene_ids.str[tr->gene->id];
+ const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type); // protein-coding transcripts get "mRNA" in column 3, the biotype attribute keeps the full name
+ str.l = 0;
+ ksprintf(&str,"%s\t.\t%s\t%d\t%d\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
+ if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+ }
+ regitr_destroy(itr);
+
+ itr = regitr_init(gff->idx_cds);
+ while ( regitr_loop(itr) )
+ {
+ gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
+ gf_tscript_t *tr = cds->tr;
+ str.l = 0;
+ ksprintf(&str,"%s\t.\tCDS\t%d\t%d\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]); // phase 3 is printed as '.'; presumably 3 is CDS_PHASE_UNKN -- TODO confirm
+ if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+ }
+ regitr_destroy(itr);
+
+ itr = regitr_init(gff->idx_utr);
+ while ( regitr_loop(itr) )
+ {
+ gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
+ gf_tscript_t *tr = utr->tr;
+ str.l = 0;
+ ksprintf(&str,"%s\t.\t%s_prime_UTR\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
+ if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+ }
+ regitr_destroy(itr);
+
+ itr = regitr_init(gff->idx_exon);
+ while ( regitr_loop(itr) )
+ {
+ gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
+ gf_tscript_t *tr = exon->tr;
+ str.l = 0;
+ ksprintf(&str,"%s\t.\texon\t%d\t%d\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]); // exon->beg/end rather than itr->beg/end: register_exon pads the indexed interval by N_SPLICE_REGION_INTRON
+ if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
+ }
+ regitr_destroy(itr);
+
+ if ( bgzf_close(out)!=0 ) error("Error: close failed .. %s\n", fname);
+ free(str.s);
+
+ return 0;
+}
+
+int gff_parse(gff_t *gff)
+{   /* Read gff->fname and build the idx_tscript, idx_cds, idx_utr and idx_exon indexes.
+       Pass 1 parses every line: genes and transcripts are registered immediately, exon/CDS/UTR
+       lines are collected in aux->ftr. Pass 2 attaches the collected features to their transcripts.
+       Failures are reported through error(). Always returns 0. */
+ if ( gff->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", gff->fname);
+
+ aux_t *aux = &gff->init;
+ aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
+ aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
+ gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
+ aux->ignored_biotypes = khash_str2int_init();
+ gff_id_init(&aux->gene_ids);
+ gff_id_init(&gff->tscript_ids);
+
+ // parse gff
+ kstring_t str = {0,0,0};
+ htsFile *fp = hts_open(gff->fname,"r");
+ if ( !fp ) error("Failed to read %s\n", gff->fname);
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+ int ret = gff_parse_line(gff, str.s, aux->ftr + aux->nftr); // parse into the next free slot
+ if ( !ret ) aux->nftr++; // the slot is kept only for exon/CDS/UTR lines (return value 0)
+ }
+ free(str.s);
+ if ( hts_close(fp)!=0 ) error("Close failed: %s\n", gff->fname);
+
+
+ // process gff information: connect CDS and exons to transcripts
+ gff->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+ gff->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+ gff->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+
+ int i;
+ for (i=0; i<aux->nftr; i++)
+ {
+ ftr_t *ftr = &aux->ftr[i];
+
+ // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+ if ( k==kh_end(aux->id2tr) ) continue; // no corresponding transcript registered, must be an unsupported biotype
+
+ gf_tscript_t *tr = kh_val(aux->id2tr,k);
+ tr->used = 1; // mark transcript and gene as having at least one exon/CDS/UTR (reported as used= by gff_dump)
+ tr->gene->used = 1;
+
+ // populate regidx by category:
+ // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+ // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+ if ( ftr->type==GF_CDS ) register_cds(gff, ftr);
+ else if ( ftr->type==GF_EXON ) register_exon(gff, ftr);
+ else if ( ftr->type==GF_UTR5 ) register_utr(gff, ftr);
+ else if ( ftr->type==GF_UTR3 ) register_utr(gff, ftr);
+ else
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,gff->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
+ }
+ tscript_init_cds(gff); // fills idx_tscript and idx_cds
+
+ if ( gff->verbosity > 0 )
+ {
+ fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ regidx_nregs(gff->idx_tscript),
+ regidx_nregs(gff->idx_exon),
+ regidx_nregs(gff->idx_cds),
+ regidx_nregs(gff->idx_utr));
+ }
+
+ if ( gff->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) )
+ {
+ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+ fprintf(bcftools_stderr,"Ignored the following biotypes:\n");
+ for (i = kh_begin(ign); i < kh_end(ign); i++)
+ {
+ if ( !kh_exist(ign,i)) continue;
+ const char *biotype = kh_key(ign,i);
+ if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")";
+ fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype);
+ }
+ }
+ khash_str2int_destroy_free(aux->ignored_biotypes);
+
+ // warned about unprinted warnings
+ if ( gff->verbosity > 0 )
+ {
+ int nwarn = 0; // number of warnings that were counted but not printed: all but the first of each kind unless verbosity > 1
+ #define INC_NWARN(X) if (gff->warned.X) nwarn += gff->verbosity > 1 ? 0 : gff->warned.X - 1;
+ INC_NWARN(unknown_chr);
+ INC_NWARN(unknown_tscript_biotype);
+ INC_NWARN(unknown_strand);
+ INC_NWARN(unknown_phase);
+ INC_NWARN(duplicate_id);
+ INC_NWARN(unknown_cds_phase);
+ INC_NWARN(incomplete_cds);
+ INC_NWARN(wrong_phase);
+ INC_NWARN(overlapping_cds);
+ if ( nwarn > 0 )
+ fprintf(bcftools_stderr,"Warning: %d warnings were supressed, run with `--verbose 2` to see them all\n",nwarn);
+ }
+
+ if ( gff->dump_fname ) gff_dump(gff, gff->dump_fname); // must happen before aux->seq and aux->gene_ids are released below
+
+ if ( !regidx_nregs(gff->idx_tscript) )
+ error("Error: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+ " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+ " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
+
+ free(aux->seq); // NOTE(review): only the pointer array is freed here; the names are the seq2int keys released by khash_str2int_destroy_free below
+ free(aux->ftr);
+ khash_str2int_destroy_free(aux->seq2int);
+ // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+ kh_destroy(int2tscript,aux->id2tr);
+ gff_id_destroy(&aux->gene_ids);
+
+ return 0;
+}
+
+// Allocate a zero-initialised gff_t for the annotation file fname.
+//
+// The file is not opened here, see gff_parse(). The fname pointer is stored as
+// is (the string is not copied), so it must stay valid for as long as the
+// structure is used. An allocation failure is reported through error() instead
+// of dereferencing a NULL pointer.
+//
+// Returns the new structure, to be released with gff_destroy().
+gff_t *gff_init(const char *fname)
+{
+    gff_t *gff = calloc(1, sizeof(gff_t));
+    if ( !gff ) error("[%s:%d %s] Could not allocate memory\n",__FILE__,__LINE__,__FUNCTION__);
+    gff->fname = fname;
+    return gff;
+}
+// Release everything owned by a gff_t: the gene records kept in init.gid2gene
+// (including their names), the four region indexes with their payloads, the
+// transcript id table and the structure itself.
+//
+// Passing NULL is a no-op, in line with free().
+void gff_destroy(gff_t *gff)
+{
+    if ( !gff ) return;
+
+    khint_t k;
+    if ( gff->init.gid2gene )
+    {
+        // the hash is kept after gff_parse() only so that the genes can be freed here
+        for (k=0; k<kh_end(gff->init.gid2gene); k++)
+        {
+            if ( !kh_exist(gff->init.gid2gene, k) ) continue;
+            gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
+            free(gene->name);
+            free(gene);
+        }
+        kh_destroy(int2gene,gff->init.gid2gene);
+    }
+
+    regidx_destroy(gff->idx_cds);
+    regidx_destroy(gff->idx_utr);
+    regidx_destroy(gff->idx_exon);
+    regidx_destroy(gff->idx_tscript);
+
+    gff_id_destroy(&gff->tscript_ids);
+    free(gff);
+}
+
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2023 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ GFF parsing code refactored from csq.c
+
+ Things that would be nice to have
+ - dynamic N_REF_PAD
+ - for stop-lost events (also in frameshifts) report the number of truncated aa's
+ - memory could be greatly reduced by indexing gff (but it is quite compact already)
+ - deletions that go beyond transcript boundaries are not checked at sequence level
+ - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+ - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+ Read about transcript types here
+ http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+ http://www.ensembl.org/info/genome/variation/predicted_data.html
+ https://www.gencodegenes.org/pages/biotypes.html
+
+ List of supported biotypes
+ antisense
+ IG_C_gene
+ IG_D_gene
+ IG_J_gene
+ IG_LV_gene
+ IG_V_gene
+ lincRNA
+ lncRNA .. generic term for 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping
+ macro_lncRNA
+ miRNA
+ misc_RNA
+ Mt_rRNA
+ Mt_tRNA
+ polymorphic_pseudogene
+ processed_transcript
+ protein_coding, mRNA
+ ribozyme
+ rRNA
+ sRNA
+ scRNA
+ scaRNA
+ sense_intronic
+ sense_overlapping
+ snRNA
+ snoRNA
+ TR_C_gene
+ TR_D_gene
+ TR_J_gene
+ TR_V_gene
+
+ The gff parsing logic
+ We collect features by combining gff lines A,B,C as follows:
+ A .. gene line with a supported biotype
+ A.ID=~/^gene:/
+
+ B .. transcript line referencing A with supported biotype
+ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+ C .. corresponding CDS, exon, and UTR lines:
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+ complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+ The supported consequence types, sorted by impact:
+ splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+ splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
+ stop_gained .. DNA sequence variant resulting in a stop codon
+ frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+ stop_lost .. elongated transcript, stop codon changed
+ start_lost .. the first codon changed
+ inframe_altering .. combination of indels leading to unchanged reading frame and length
+ inframe_insertion .. inserted coding sequence, unchanged reading frame
+ inframe_deletion .. deleted coding sequence, unchanged reading frame
+ missense_variant .. amino acid (aa) change, unchanged length
+ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron
+ synonymous_variant .. DNA sequence variant resulting in no amino acid change
+ stop_retained_variant .. different stop codon
+ start_retained_variant .. start codon retained by indel realignment
+ non_coding_variant .. variant in non-coding sequence, such as RNA gene
+ 5_prime_UTR_variant
+ 3_prime_UTR_variant
+ intron_variant .. reported only if none of the above
+ intergenic_variant .. reported only if none of the above
+
+
+ The annotation algorithm.
+ The algorithm checks if the variant falls in a region of a supported type. The
+ search is performed in the following order, until a match is found:
+ 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+ 2. idx_utr(gf_utr_t) - check UTR hits
+ 3. idx_exon(gf_exon_t) - check for splice variants
+ 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
+
+ These regidx indexes are created by parsing a gff3 file as follows:
+ 1. create the array "ftr" of all UTR, CDS, exons. This will be
+ processed later and pruned based on transcript types we want to keep.
+ In the same go, create the hash "id2tr" of transcripts to keep
+ (based on biotype) which maps from transcript_id to a transcript. At
+ the same time also build the hash "gid2gene" which maps from gene_id to
+ gf_gene_t pointer.
+
+ 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+ Use only features from "ftr" which are present in "id2tr".
+
+ 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
+
+ Data structures.
+ idx_cds, idx_utr, idx_exon, idx_tscript:
+ as described above, regidx structures for fast lookup of exons/transcripts
+ overlapping a region, the payload is a pointer to tscript.cds
+*/
+
+#ifndef GFF_H__
+#define GFF_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <getopt.h>
+#include <math.h>
+#include <inttypes.h>
+#include <htslib/hts.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "regidx.h"
+
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE 0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+
+// GFF line types
+#define GFF_UNKN_LINE 0
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE 2
+
+
+/*
+ Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
+#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
+#define GF_MT_tRNA 2
+#define GF_lincRNA 3
+#define GF_miRNA 4
+#define GF_MISC_RNA 5
+#define GF_rRNA 6
+#define GF_snRNA 7
+#define GF_snoRNA 8
+#define GF_PROCESSED_TRANSCRIPT 9
+#define GF_ANTISENSE 10
+#define GF_macro_lncRNA 11
+#define GF_ribozyme 12
+#define GF_sRNA 13
+#define GF_scRNA 14
+#define GF_scaRNA 15
+#define GF_SENSE_INTRONIC 16
+#define GF_SENSE_OVERLAPPING 17
+#define GF_PSEUDOGENE 18
+#define GF_PROCESSED_PSEUDOGENE 19
+#define GF_ARTIFACT 20
+#define GF_IG_PSEUDOGENE 21
+#define GF_IG_C_PSEUDOGENE 22
+#define GF_IG_J_PSEUDOGENE 23
+#define GF_IG_V_PSEUDOGENE 24
+#define GF_TR_V_PSEUDOGENE 25
+#define GF_TR_J_PSEUDOGENE 26
+#define GF_MT_tRNA_PSEUDOGENE 27
+#define GF_misc_RNA_PSEUDOGENE 28
+#define GF_miRNA_PSEUDOGENE 29
+#define GF_RIBOZYME 30 // NOTE(review): differs only by case from GF_ribozyme (12) above; confirm both values are intended
+#define GF_RETAINED_INTRON 31
+#define GF_RETROTRANSPOSED 32
+#define GF_tRNA_PSEUDOGENE 33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
+#define GF_KNOWN_NCRNA 39
+#define GF_UNITARY_PSEUDOGENE 40
+#define GF_UNPROCESSED_PSEUDOGENE 41
+#define GF_LRG_GENE 42
+#define GF_3PRIME_OVERLAPPING_ncRNA 43
+#define GF_DISRUPTED_DOMAIN 44
+#define GF_vaultRNA 45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
+#define GF_AMBIGUOUS_ORF 47
+#define GF_lncRNA 48
+#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
+#define GF_IG_C (3|(1<<GF_coding_bit))
+#define GF_IG_D (4|(1<<GF_coding_bit))
+#define GF_IG_J (5|(1<<GF_coding_bit))
+#define GF_IG_LV (6|(1<<GF_coding_bit))
+#define GF_IG_V (7|(1<<GF_coding_bit))
+#define GF_TR_C (8|(1<<GF_coding_bit))
+#define GF_TR_D (9|(1<<GF_coding_bit))
+#define GF_TR_J (10|(1<<GF_coding_bit))
+#define GF_TR_V (11|(1<<GF_coding_bit))
+#define GF_NMD (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
+#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
+#define GF_EXON ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+#define CDS_PHASE_UNKN 3
+typedef struct gf_tscript_t_ gf_tscript_t;
+typedef struct
+{
+ gf_tscript_t *tr; // transcript (see gf_tscript_t.cds for the reverse link)
+ uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
+ uint32_t pos; // 0-based index of the first exon base within the transcript (only to
+ // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+ uint32_t len; // exon length
+ uint32_t icds:30, // exon index within the transcript
+ phase:2; // offset of the CDS: 0,1,2 or CDS_PHASE_UNKN (3) for unknown
+}
+gf_cds_t;
+typedef struct
+{
+ char *name; // human readable name, e.g. ORF45
+ uint32_t iseq; // NOTE(review): presumably the index of the sequence (chromosome) the gene lies on - confirm
+ uint32_t id,beg,end,strand:31, // used only by --dump-gff; id,beg,end are full-width, only strand and used are bit-fields
+ used:1; // does it have any exons, CDS, UTR?
+}
+gf_gene_t;
+typedef struct
+{
+ uint32_t beg,end; // exon coordinates; NOTE(review): presumably 0-based, inclusive, on the reference strand like gf_tscript_t.beg/end - confirm
+ gf_tscript_t *tr; // associated transcript
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t; // selects between the 3' and the 5' UTR
+typedef struct
+{
+ utr_t which; // prime3 or prime5
+ uint32_t beg,end; // UTR coordinates; NOTE(review): presumably 0-based, inclusive, on the reference strand like gf_tscript_t.beg/end - confirm
+ gf_tscript_t *tr; // associated transcript
+}
+gf_utr_t;
+struct gf_tscript_t_
+{
+ uint32_t id; // transcript id
+ uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+ uint32_t strand:1, // STRAND_REV or STRAND_FWD
+ used:1, // does it have any exons, UTRs, CDS?
+ ncds:30, // number of exons
+ mcds; // full-width uint32_t, not a bit-field; NOTE(review): presumably the allocated capacity of cds - confirm
+ gf_cds_t **cds; // ordered list of exons
+ uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
+ type:30; // one of GF_* types
+ gf_gene_t *gene; // NOTE(review): presumably the gene this transcript belongs to - confirm
+ void *aux; // auxiliary user data
+};
+
+typedef enum
+{
+ // write options
+ verbosity, // int, 0-2
+ strip_chr_names, // int, 0 to leave as is, 1 to strip 'chr' prefix
+ force_out_of_phase, // int, 1 to proceed even if a CDS exon is out of the expected phase
+ dump_fname, // const char*, dump the parsed GFF into this file, for debugging purposes
+
+ // read options
+ idx_cds, // the four idx_* keys name the regidx indexes described in the header comment above
+ idx_utr,
+ idx_exon,
+ idx_tscript,
+}
+gff_opt_t;
+
+typedef enum { transcript } id_type_t; // for gff_id2string
+
+typedef struct gff_t_ gff_t; // opaque handle: struct gff_t_ is not defined in this header
+
+gff_t *gff_init(const char *fname);
+int gff_parse(gff_t *gff);
+void gff_destroy(gff_t *gff);
+
+int gff_set(gff_t *gff, gff_opt_t key, ...); // returns 0 on success
+void *gff_get(gff_t *gff, gff_opt_t key); // NOTE(review): the pointed-to type presumably depends on key - confirm against the implementation
+const char *gff_id2string(gff_t *gff, id_type_t type, int id);
+const char *gf_type2gff_string(int type); // NOTE(review): type is presumably one of the GF_* constants - confirm
+
+#endif
// hex.h
//
// @category Libraries
-// @author Nicola Asuni <nicola.asuni@genomicsplc.com>
+// @author Nicola Asuni <info@tecnick.com>
+// @link https://github.com/tecnickcom/variantkey
+// @license MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE)
// @copyright 2017-2018 GENOMICS plc
-// @license MIT (see LICENSE)
-// @link https://github.com/genomicsplc/variantkey
//
// LICENSE
//
/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
- Copyright (C) 2008-2022 Genome Research Ltd.
+ Copyright (C) 2008-2023 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
int indels_v20;
int argc;
char **argv;
+ int write_index;
+ char *index_fn;
} mplp_conf_t;
typedef struct {
if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
// Left & right cigar op match.
int lr = b->core.l_qseq > 500;
- int lm = 0, rm = 0, k;
+ int lm = 0, rm = 0, k, nm = 0;
for (k = 0; k < ncig; k++) {
int cop = bam_cigar_op(cig[k]);
if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
continue;
if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
- cop == BAM_CEQUAL)
+ cop == BAM_CEQUAL) {
lm += bam_cigar_oplen(cig[k]);
- else
+ nm++;
+ } else {
break;
+ }
}
- for (k = ncig-1; k >= 0; k--) {
- int cop = bam_cigar_op(cig[k]);
- if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+ // if everything is a match (or sequence (mis)match) then move on
+ // because we don't have an indel in the middle
+ if (nm != ncig) {
+ for (k = ncig-1; k >= 0; k--) {
+ int cop = bam_cigar_op(cig[k]);
+ if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+ continue;
+
+ if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+ cop == BAM_CEQUAL)
+ rm += bam_cigar_oplen(cig[k]);
+ else
+ break;
+ }
+
+ if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
continue;
- if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
- cop == BAM_CEQUAL)
- rm += bam_cigar_oplen(cig[k]);
- else
- break;
+ if (lm >= REALN_DIST && rm >= REALN_DIST &&
+ has_clip < (0.15+0.05*(nt>20))*nt)
+ continue;
}
-
- if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
- continue;
-
- if (lm >= REALN_DIST && rm >= REALN_DIST &&
- has_clip < (0.15+0.05*(nt>20))*nt)
- continue;
}
if (b->core.l_qseq > 500) {
for (i=0; i<nsmpl; i++)
bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
+ if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname);
conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
conf->delta_baseQ);
bcf_destroy1(conf->bcf_rec);
if (conf->bcf_fp)
{
+ if ( conf->write_index )
+ {
+ if ( bcf_idx_save(conf->bcf_fp)<0 )
+ {
+ if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
+ error("Error: cannot write to index %s\n",conf->index_fn);
+ }
+ free(conf->index_fn);
+ }
if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
bcf_hdr_destroy(conf->bcf_hdr);
bcf_call_destroy(conf->bca);
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
" --threads INT Use multithreading with INT worker threads [0]\n"
+ " --write-index Automatically index the output files [off]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" -X, --config STR Specify platform specific profiles (see below)\n"
{"seed", required_argument, NULL, 13},
{"ambig-reads", required_argument, NULL, 14},
{"ar", required_argument, NULL, 14},
+ {"write-index",no_argument,NULL,21},
{NULL, 0, NULL, 0}
};
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
}
break;
case 20: mplp.indels_v20 = 1; break;
+ case 21: mplp.write_index = 1; break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
- Copyright (C) 2008-2022 Genome Research Ltd.
+ Copyright (C) 2008-2023 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
int indels_v20;
int argc;
char **argv;
+ int write_index;
+ char *index_fn;
} mplp_conf_t;
typedef struct {
if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
// Left & right cigar op match.
int lr = b->core.l_qseq > 500;
- int lm = 0, rm = 0, k;
+ int lm = 0, rm = 0, k, nm = 0;
for (k = 0; k < ncig; k++) {
int cop = bam_cigar_op(cig[k]);
if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
continue;
if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
- cop == BAM_CEQUAL)
+ cop == BAM_CEQUAL) {
lm += bam_cigar_oplen(cig[k]);
- else
+ nm++;
+ } else {
break;
+ }
}
- for (k = ncig-1; k >= 0; k--) {
- int cop = bam_cigar_op(cig[k]);
- if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+ // if everything is a match (or sequence (mis)match) then move on
+ // because we don't have an indel in the middle
+ if (nm != ncig) {
+ for (k = ncig-1; k >= 0; k--) {
+ int cop = bam_cigar_op(cig[k]);
+ if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+ continue;
+
+ if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+ cop == BAM_CEQUAL)
+ rm += bam_cigar_oplen(cig[k]);
+ else
+ break;
+ }
+
+ if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
continue;
- if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
- cop == BAM_CEQUAL)
- rm += bam_cigar_oplen(cig[k]);
- else
- break;
+ if (lm >= REALN_DIST && rm >= REALN_DIST &&
+ has_clip < (0.15+0.05*(nt>20))*nt)
+ continue;
}
-
- if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
- continue;
-
- if (lm >= REALN_DIST && rm >= REALN_DIST &&
- has_clip < (0.15+0.05*(nt>20))*nt)
- continue;
}
if (b->core.l_qseq > 500) {
for (i=0; i<nsmpl; i++)
bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
+ if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname);
conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
conf->delta_baseQ);
bcf_destroy1(conf->bcf_rec);
if (conf->bcf_fp)
{
+ if ( conf->write_index )
+ {
+ if ( bcf_idx_save(conf->bcf_fp)<0 )
+ {
+ if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
+ error("Error: cannot write to index %s\n",conf->index_fn);
+ }
+ free(conf->index_fn);
+ }
if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname);
bcf_hdr_destroy(conf->bcf_hdr);
bcf_call_destroy(conf->bca);
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
" --threads INT Use multithreading with INT worker threads [0]\n"
+ " --write-index Automatically index the output files [off]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" -X, --config STR Specify platform specific profiles (see below)\n"
{"seed", required_argument, NULL, 13},
{"ambig-reads", required_argument, NULL, 14},
{"ar", required_argument, NULL, 14},
+ {"write-index",no_argument,NULL,21},
{NULL, 0, NULL, 0}
};
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
}
break;
case 20: mplp.indels_v20 = 1; break;
+ case 21: mplp.write_index = 1; break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0};
char *chr_name = NULL, *p, *q = line + 9; // skip ##contig=
char *end = q;
- int nopen = 1, chr_len = 0;
+ int nopen = 1;
+ hts_pos_t chr_len = 0;
while ( *end && *end!='\n' ) end++;
while ( *q && *q!='\n' && nopen>0 )
{
if ( !strcmp("ID",key.s) )
{
if ( khash_str2int_has_key(chr_seen,val.s) ) continue;
- chr_len = faidx_seq_len(fai, val.s);
+ chr_len = faidx_seq_len64(fai, val.s);
if ( chr_len==-1 )
{
free(val.s); free(key.s); free(tmp.s);
if ( quoted ) kputc('"',&tmp);
}
if ( !chr_name ) return end;
- ksprintf(dst,"##contig=<ID=%s,length=%d%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
+ ksprintf(dst,"##contig=<ID=%s,length=%"PRIhts_pos"%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
free(key.s); free(val.s); free(tmp.s);
return q;
}
for (i=0; i<n; i++)
{
if ( khash_str2int_has_key(chr_seen,faidx_iseq(fai,i)) ) continue;
- ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%d>\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i)));
+ ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%"PRIhts_pos">\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i)));
}
kputs(tmp+1,&hdr_txt_new);
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
-
+
static struct option loptions[] =
{
{"temp-prefix",1,0,'T'},
kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0};
char *chr_name = NULL, *p, *q = line + 9; // skip ##contig=
char *end = q;
- int nopen = 1, chr_len = 0;
+ int nopen = 1;
+ hts_pos_t chr_len = 0;
while ( *end && *end!='\n' ) end++;
while ( *q && *q!='\n' && nopen>0 )
{
if ( !strcmp("ID",key.s) )
{
if ( khash_str2int_has_key(chr_seen,val.s) ) continue;
- chr_len = faidx_seq_len(fai, val.s);
+ chr_len = faidx_seq_len64(fai, val.s);
if ( chr_len==-1 )
{
free(val.s); free(key.s); free(tmp.s);
if ( quoted ) kputc('"',&tmp);
}
if ( !chr_name ) return end;
- ksprintf(dst,"##contig=<ID=%s,length=%d%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
+ ksprintf(dst,"##contig=<ID=%s,length=%"PRIhts_pos"%s>",chr_name,chr_len,tmp.l ? tmp.s : "");
free(key.s); free(val.s); free(tmp.s);
return q;
}
for (i=0; i<n; i++)
{
if ( khash_str2int_has_key(chr_seen,faidx_iseq(fai,i)) ) continue;
- ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%d>\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i)));
+ ksprintf(&hdr_txt_new,"##contig=<ID=%s,length=%"PRIhts_pos">\n",faidx_iseq(fai,i),faidx_seq_len64(fai,faidx_iseq(fai,i)));
}
kputs(tmp+1,&hdr_txt_new);
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
-
+
static struct option loptions[] =
{
{"temp-prefix",1,0,'T'},
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// variantkey.h
//
// @category Libraries
-// @author Nicola Asuni <nicola.asuni@genomicsplc.com>
-// @copyright 2017-2018 GENOMICS plc
-// @license MIT (see LICENSE)
-// @link https://github.com/genomicsplc/variantkey
+// @author Nicola Asuni <info@tecnick.com>
+// @link https://github.com/tecnickcom/variantkey
+// @license MIT [LICENSE](https://raw.githubusercontent.com/tecnickcom/variantkey/main/LICENSE)
+// @copyright 2017-2018 GENOMICS plc, 2018-2023 Nicola Asuni - Tecnick.com
//
// LICENSE
//
// Copyright (c) 2017-2018 GENOMICS plc
+// Copyright (c) 2018-2023 Nicola Asuni - Tecnick.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
#define VKMASK_REFALT 0x000000007FFFFFFF //!< VariantKey binary mask for REF+ALT [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ]
#define VKSHIFT_CHROM 59 //!< CHROM LSB position from the VariantKey LSB
#define VKSHIFT_POS 31 //!< POS LSB position from the VariantKey LSB
+#define MAXUINT32 0xFFFFFFFF //!< Maximum value for uint32_t
/**
* VariantKey struct.
uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding
} vkrange_t;
-/** @brief Returns chromosome numerical encoding.
+/** @brief Returns the encoding for a numerical chromosome input.
*
* @param chrom Chromosome. An identifier from the reference genome, no white-space permitted.
* @param size Length of the chrom string, excluding the terminating null byte.
*
* @return CHROM code
*/
+static inline uint8_t encode_numeric_chrom(const char *chrom, size_t size)
+{
+ size_t i;
+ uint8_t v = (chrom[0] - '0'); // chrom[0] is read and converted without being validated here (also when size is 0)
+ for (i = 1; i < size; i++)
+ {
+ if ((chrom[i] > '9') || (chrom[i] < '0'))
+ {
+ return 0; // NA: a character that is not a number was found.
+ }
+ v = ((v * 10) + (chrom[i] - '0')); // NOTE(review): the result is stored back into a uint8_t, so values above 255 wrap silently
+ }
+ return v;
+}
+
+
+/** @brief Returns a true value (1) if the input chrom has 'chr' prefix (case insensitive).
+ *
+ * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted.
+ * @param size Length of the chrom string, excluding the terminating null byte.
+ *
+ * @return True (1) if the chr prefix is present.
+ */
+static inline int has_chrom_chr_prefix(const char *chrom, size_t size)
+{
+ return ((size > 3) // must be strictly longer than "chr": a bare 3-character "chr" yields 0
+ && ((chrom[0] == 'c') || (chrom[0] == 'C'))
+ && ((chrom[1] == 'h') || (chrom[1] == 'H'))
+ && ((chrom[2] == 'r') || (chrom[2] == 'R')));
+}
+
+/** @brief Returns chromosome numerical encoding.
+ *
+ * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted.
+ * @param size Length of the chrom string, excluding the terminating null byte.
+ *
+ * @return CHROM code or 0 in case of invalid input.
+ */
static inline uint8_t encode_chrom(const char *chrom, size_t size)
{
- // X > 23 ; Y > 24 ; M > 25
+ // X = 23; Y = 24; M = 25; any other letter is mapped to 0:
static const uint8_t onecharmap[] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
- // remove "chr" prefix
- if ((size > 3)
- && ((chrom[0] == 'c') || (chrom[0] == 'C'))
- && ((chrom[1] == 'h') || (chrom[1] == 'H'))
- && ((chrom[2] == 'r') || (chrom[2] == 'R')))
+ if (has_chrom_chr_prefix(chrom, size))
{
+ // remove "chr" prefix
chrom += 3;
size -= 3;
}
{
return 0;
}
- if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number
+ if ((chrom[0] <= '9') && (chrom[0] >= '0'))
{
- size_t i;
- uint8_t v = (chrom[0] - '0');
- for (i = 1; i < size; i++)
- {
- if ((chrom[i] > '9') || (chrom[i] < '0'))
- {
- return 0; // NA
- }
- v = ((v * 10) + (chrom[i] - '0'));
- }
- return v;
+ return encode_numeric_chrom(chrom, size);
}
if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't'))))
{
{
/*
Encode base:
- A > 0
- C > 1
- G > 2
- T > 3
+ A = 0
+ C = 1
+ G = 2
+ T = 3
*/
static const uint32_t map[] =
{
uint8_t bitpos = 23;
if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) || (encode_allele(&h, &bitpos, alt, sizealt) < 0))
{
- return 0; // error code
+ return MAXUINT32; // error code
}
return h;
}
if ((sizeref + sizealt) <= 11)
{
uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt);
- if (h != 0)
+ if (h != MAXUINT32)
{
return h;
}
vk->refalt = extract_variantkey_refalt(code);
}
-/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT.
+/**
+ * Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT.
+ * The variant should be already normalized (see normalize_variant or use normalized_variantkey).
*
* @param chrom Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted.
* @param sizechrom Length of the chrom string, excluding the terminating null byte.
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
htsFile *out_fh;
int output_type, n_threads, clevel;
bcf_sr_regions_t *tgts;
+ char *index_fn;
+ int write_index;
regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
regitr_t *tgt_itr;
if ( args->mark_sites )
{
- if ( !args->targets_fname ) error("The -a option not given\n");
- bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
- args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+ if ( !args->targets_fname )
+ {
+ if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites);
+ fprintf(stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites);
+ bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites marked with `bcftools annotate -m %s`\">",
+ args->mark_sites,args->mark_sites);
+ }
+ else
+ bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+ args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( args->n_threads )
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
}
convert_destroy(args->set_ids);
if ( args->filter )
filter_destroy(args->filter);
- if (args->out_fh) hts_close(args->out_fh);
+ if (args->out_fh)
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ }
free(args->sample_map);
free(args->merge_method_str.s);
}
for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
{
+ hts_pos_t vcf_end = line->pos + line->rlen - 1;
while ( regitr_overlap(args->tgt_itr) )
{
annot_line_t *tmp = &args->alines[0];
// Check min overlap
int len_ann = tmp->end - tmp->start + 1;
int len_vcf = line->rlen;
- int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+ int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
assert( isec > 0 );
if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
if ( ret==0 )
args->cols[j].done = 1;
+ has_overlap = 1;
}
}
- has_overlap = 1;
}
for (j=0; j<args->ncols; j++)
{
if ( args->mark_sites )
{
+ if ( !args->targets_fname ) has_overlap = 1;
+
// ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
if ( args->mark_sites_logic==MARK_LISTED )
bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n");
{"min-overlap",required_argument,NULL,12},
{"no-version",no_argument,NULL,8},
{"force",no_argument,NULL,'f'},
+ {"write-index",no_argument,NULL,13},
{NULL,0,NULL,0}
};
char *tmp;
case 10 : args->single_overlaps = 1; break;
case 11 : args->rename_annots = optarg; break;
case 12 : args->min_overlap_str = optarg; break;
+ case 13 : args->write_index = 1; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
htsFile *out_fh;
int output_type, n_threads, clevel;
bcf_sr_regions_t *tgts;
+ char *index_fn;
+ int write_index;
regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
regitr_t *tgt_itr;
if ( args->mark_sites )
{
- if ( !args->targets_fname ) error("The -a option not given\n");
- bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
- args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
+ if ( !args->targets_fname )
+ {
+ if ( args->mark_sites_logic!=MARK_LISTED ) error("The -a option not given but -%s logic was requested\n",args->mark_sites);
+ fprintf(bcftools_stderr,"Note: The -a option not given, all sites will be annotated with INFO/%s\n",args->mark_sites);
+ bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites marked with `bcftools annotate -m %s`\">",
+ args->mark_sites,args->mark_sites);
+ }
+ else
+ bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
+ args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( args->n_threads )
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
}
convert_destroy(args->set_ids);
if ( args->filter )
filter_destroy(args->filter);
- if (args->out_fh) hts_close(args->out_fh);
+ if (args->out_fh)
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ }
free(args->sample_map);
free(args->merge_method_str.s);
}
for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
{
+ hts_pos_t vcf_end = line->pos + line->rlen - 1;
while ( regitr_overlap(args->tgt_itr) )
{
annot_line_t *tmp = &args->alines[0];
// Check min overlap
int len_ann = tmp->end - tmp->start + 1;
int len_vcf = line->rlen;
- int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
+ int isec = (tmp->end < vcf_end ? tmp->end : vcf_end) - (tmp->start > line->pos ? tmp->start : line->pos) + 1;
assert( isec > 0 );
if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue;
if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue;
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
if ( ret==0 )
args->cols[j].done = 1;
+ has_overlap = 1;
}
}
- has_overlap = 1;
}
for (j=0; j<args->ncols; j++)
{
if ( args->mark_sites )
{
+ if ( !args->targets_fname ) has_overlap = 1;
+
// ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87
if ( args->mark_sites_logic==MARK_LISTED )
bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0);
fprintf(bcftools_stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
fprintf(bcftools_stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(bcftools_stderr, " --threads INT Number of extra output compression threads [0]\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n");
{"min-overlap",required_argument,NULL,12},
{"no-version",no_argument,NULL,8},
{"force",no_argument,NULL,'f'},
+ {"write-index",no_argument,NULL,13},
{NULL,0,NULL,0}
};
char *tmp;
case 10 : args->single_overlaps = 1; break;
case 11 : args->rename_annots = optarg; break;
case 12 : args->min_overlap_str = optarg; break;
+ case 13 : args->write_index = 1; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int argc;
char **argv;
+ char *index_fn;
+ int write_index;
// int flag, prior_type, n1, n_sub, *sublist, n_perm;
// uint32_t *trio_aux;
if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}
free(args->str.s);
if ( args->gvcf ) gvcf_destroy(args->gvcf);
bcf_hdr_destroy(args->aux.hdr);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
bcf_sr_destroy(args->aux.srs);
}
fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
fprintf(stderr, " -v, --variants-only Output variant sites only\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Consensus/variant calling options:\n");
fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
{"chromosome-X",no_argument,NULL,'X'},
{"chromosome-Y",no_argument,NULL,'Y'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
args.regions_overlap = parse_overlap_option(optarg);
if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
break;
+ case 10: args.write_index = 1; break;
default: usage(&args);
}
}
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int argc;
char **argv;
+ char *index_fn;
+ int write_index;
// int flag, prior_type, n1, n_sub, *sublist, n_perm;
// uint32_t *trio_aux;
if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}
free(args->str.s);
if ( args->gvcf ) gvcf_destroy(args->gvcf);
bcf_hdr_destroy(args->aux.hdr);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
bcf_sr_destroy(args->aux.srs);
}
fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
{"chromosome-X",no_argument,NULL,'X'},
{"chromosome-Y",no_argument,NULL,'Y'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
args.regions_overlap = parse_overlap_option(optarg);
if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
break;
+ case 10: args.write_index = 1; break;
default: usage(&args);
}
}
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int output_type, n_threads, record_cmd_line, clevel;
bcf_hdr_t *out_hdr;
int *seen_seq;
+ char *index_fn;
+ int write_index;
// phasing
int *start_pos, start_tid, ifname;
int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
int verbose, explicit_output_type, ligate_force, ligate_warn;
+ int sites_only;
htsThreadPool *tpool;
}
args_t;
+// Strip per-sample information from a header when --drop-genotypes is in effect.
+//   args .. program settings, only args->sites_only is consulted
+//   hdr  .. header to convert; it is destroyed and replaced when sites_only is set
+// Returns hdr untouched when sites_only is off. Otherwise returns a new header
+// created by subsetting to zero samples and removing the FORMAT header lines.
+// Terminates via error() if the subset header cannot be created.
+static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr)
+{
+    if ( !args->sites_only ) return hdr;
+    bcf_hdr_t *rmme = hdr;
+    hdr = bcf_hdr_subset(rmme, 0, 0, 0);
+    // bcf_hdr_subset() returns NULL on failure, never hand that to bcf_hdr_remove()
+    if ( !hdr ) error("Error: failed to remove samples from the header\n");
+    bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+    bcf_hdr_destroy(rmme);
+    return hdr;
+}
+
static void init_data(args_t *args)
{
bcf1_t *line = NULL;
{
htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+ hdr = drop_hdr_genotypes(args, hdr);
+
args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
}
if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
if ( args->allow_overlaps )
{
int i;
if ( args->out_fh )
{
- if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n",args->output_fname?args->output_fname:"stdout");
}
if ( args->tpool && !args->files )
{
bcf1_t *brec = args->buf[i+1];
int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
- if ( nGTs < 0 )
+ if ( nGTs < 0 )
{
if ( !gt_absent_warned )
{
bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
PQ_printed = 1;
for (j=0; j<nsmpl; j++)
- if ( args->phase_qual[j] < args->min_PQ )
+ if ( args->phase_qual[j] < args->min_PQ )
{
args->phase_set[j] = rec->pos+1;
args->phase_set_changed = 1;
{
bcf1_t *line = bcf_sr_get_line(args->files,i);
if ( !line ) continue;
+ if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
bcf_translate(args->out_hdr, args->files->readers[i].header, line);
if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( args->remove_dups ) break;
}
}
}
- else // concatenating
+ else // concatenate as is
{
struct timeval t0, t1;
kstring_t tmp = {0,0,0};
htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]);
if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool);
bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]);
+ if ( args->sites_only )
+ {
+ bcf_hdr_t *hdr_ori = hdr;
+ hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0);
+ bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+ bcf_hdr_destroy(hdr_ori);
+ }
if ( !fp->is_bin && args->output_type&FT_VCF )
{
line->max_unpack = BCF_UN_STR;
while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
{
char *str = fp->line.s;
+
+ // remove genotypes
+ if ( args->sites_only )
+ {
+ int ntab = 0;
+ while ( *str )
+ {
+ if ( *str == '\t' && ++ntab==8 )
+ {
+ *str = 0;
+ break;
+ }
+ str++;
+ }
+ str = fp->line.s;
+ }
while ( *str && *str!='\t' ) str++;
tmp.l = 0;
kputsn(fp->line.s,str-fp->line.s,&tmp);
line->max_unpack = 0;
while ( bcf_read(fp, hdr, line)==0 )
{
+ if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
bcf_translate(args->out_hdr, hdr, line);
if ( prev_chr_id!=line->rid )
fprintf(stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n");
fprintf(stderr, " -f, --file-list FILE Read the list of files from a file.\n");
+ fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
fprintf(stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n");
fprintf(stderr, " --ligate-warn Drop sites in imperfect overlaps\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
}
{"file-list",required_argument,NULL,'f'},
{"min-PQ",required_argument,NULL,'q'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,13},
+ {"drop-genotypes",no_argument,NULL,'G'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
{
switch (c) {
case 'c': args->compact_PS = 1; break;
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 'd': args->remove_dups = optarg; break;
case 'D': args->remove_dups = "exact"; break;
- case 'q':
+ case 'q':
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
case 'a': args->allow_overlaps = 1; break;
case 'l': args->phased_concat = 1; break;
case 'f': args->file_list = optarg; break;
+ case 'G': args->sites_only = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
args->explicit_output_type = 1;
args->verbose = strtol(optarg, &tmp, 0);
if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
break;
+ case 13 : args->write_index = 1; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
+ if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n");
if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
if ( args->file_list )
{
{
if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+ if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
naive_concat(args);
destroy_data(args);
free(args);
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int output_type, n_threads, record_cmd_line, clevel;
bcf_hdr_t *out_hdr;
int *seen_seq;
+ char *index_fn;
+ int write_index;
// phasing
int *start_pos, start_tid, ifname;
int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
int verbose, explicit_output_type, ligate_force, ligate_warn;
+ int sites_only;
htsThreadPool *tpool;
}
args_t;
+// Strip per-sample information from a header when --drop-genotypes is in effect.
+//   args .. program settings, only args->sites_only is consulted
+//   hdr  .. header to convert; it is destroyed and replaced when sites_only is set
+// Returns hdr untouched when sites_only is off. Otherwise returns a new header
+// created by subsetting to zero samples and removing the FORMAT header lines.
+// Terminates via error() if the subset header cannot be created.
+static bcf_hdr_t *drop_hdr_genotypes(args_t *args, bcf_hdr_t *hdr)
+{
+    if ( !args->sites_only ) return hdr;
+    bcf_hdr_t *rmme = hdr;
+    hdr = bcf_hdr_subset(rmme, 0, 0, 0);
+    // bcf_hdr_subset() returns NULL on failure, never hand that to bcf_hdr_remove()
+    if ( !hdr ) error("Error: failed to remove samples from the header\n");
+    bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+    bcf_hdr_destroy(rmme);
+    return hdr;
+}
+
static void init_data(args_t *args)
{
bcf1_t *line = NULL;
{
htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
+ hdr = drop_hdr_genotypes(args, hdr);
+
args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool);
}
if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
if ( args->allow_overlaps )
{
int i;
if ( args->out_fh )
{
- if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n",args->output_fname?args->output_fname:"bcftools_stdout");
}
if ( args->tpool && !args->files )
{
bcf1_t *brec = args->buf[i+1];
int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
- if ( nGTs < 0 )
+ if ( nGTs < 0 )
{
if ( !gt_absent_warned )
{
bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
PQ_printed = 1;
for (j=0; j<nsmpl; j++)
- if ( args->phase_qual[j] < args->min_PQ )
+ if ( args->phase_qual[j] < args->min_PQ )
{
args->phase_set[j] = rec->pos+1;
args->phase_set_changed = 1;
{
bcf1_t *line = bcf_sr_get_line(args->files,i);
if ( !line ) continue;
+ if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
bcf_translate(args->out_hdr, args->files->readers[i].header, line);
if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( args->remove_dups ) break;
}
}
}
- else // concatenating
+ else // concatenate as is
{
struct timeval t0, t1;
kstring_t tmp = {0,0,0};
htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]);
if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool);
bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]);
+ if ( args->sites_only )
+ {
+ bcf_hdr_t *hdr_ori = hdr;
+ hdr = bcf_hdr_subset(hdr_ori, 0, 0, 0);
+ bcf_hdr_remove(hdr, BCF_HL_FMT, NULL);
+ bcf_hdr_destroy(hdr_ori);
+ }
if ( !fp->is_bin && args->output_type&FT_VCF )
{
line->max_unpack = BCF_UN_STR;
while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
{
char *str = fp->line.s;
+
+ // remove genotypes
+ if ( args->sites_only )
+ {
+ int ntab = 0;
+ while ( *str )
+ {
+ if ( *str == '\t' && ++ntab==8 )
+ {
+ *str = 0;
+ break;
+ }
+ str++;
+ }
+ str = fp->line.s;
+ }
while ( *str && *str!='\t' ) str++;
tmp.l = 0;
kputsn(fp->line.s,str-fp->line.s,&tmp);
line->max_unpack = 0;
while ( bcf_read(fp, hdr, line)==0 )
{
+ if ( args->sites_only ) bcf_subset(args->out_hdr, line, 0, 0);
bcf_translate(args->out_hdr, hdr, line);
if ( prev_chr_id!=line->rid )
fprintf(bcftools_stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d exact\n");
fprintf(bcftools_stderr, " -f, --file-list FILE Read the list of files from a file.\n");
+ fprintf(bcftools_stderr, " -G, --drop-genotypes Drop individual genotype information.\n");
fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
fprintf(bcftools_stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n");
fprintf(bcftools_stderr, " --ligate-warn Drop sites in imperfect overlaps\n");
fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, " -v, --verbose 0|1 Set verbosity level [1]\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
{"file-list",required_argument,NULL,'f'},
{"min-PQ",required_argument,NULL,'q'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,13},
+ {"drop-genotypes",no_argument,NULL,'G'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:Gr:R:cnv:",loptions,NULL)) >= 0)
{
switch (c) {
case 'c': args->compact_PS = 1; break;
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 'd': args->remove_dups = optarg; break;
case 'D': args->remove_dups = "exact"; break;
- case 'q':
+ case 'q':
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
case 'a': args->allow_overlaps = 1; break;
case 'l': args->phased_concat = 1; break;
case 'f': args->file_list = optarg; break;
+ case 'G': args->sites_only = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
args->explicit_output_type = 1;
args->verbose = strtol(optarg, &tmp, 0);
if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
break;
+ case 13 : args->write_index = 1; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
+ if ( args->sites_only && args->phased_concat ) error("The options --drop-genotypes and --ligate cannot be combined\n");
if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
if ( args->file_list )
{
{
if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+ if ( args->sites_only ) error("The option --naive cannot be combined with --drop-genotypes\n");
naive_concat(args);
destroy_data(args);
free(args);
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
bcf_hdr_t *header;
void (*convert_func)(struct _args_t *);
struct {
- int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
+ int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written;
} n;
kstring_t str;
int32_t *gts;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname, *sex_fname;
int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
+ char *index_fn;
+ int write_index;
+ struct {
+ kstring_t ref,alt,refalt;
+ } tsv;
};
static void destroy_data(args_t *args)
free(samples);
}
+// Combine the buffered REF and ALT columns into the record's allele list.
+// ALT is appended only when it is neither "." nor identical to REF. The REF,
+// ALT and combined buffers are emptied afterwards, also on failure, because
+// the column setters append to them.
+// Returns 0 on success, -1 if the alleles could not be stored in the record.
+static int _set_ref_alt(args_t *args, bcf1_t *rec)
+{
+    args->tsv.refalt.l = 0;
+    kputs(args->tsv.ref.s, &args->tsv.refalt);
+    if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) )
+    {
+        kputc(',', &args->tsv.refalt);
+        kputs(args->tsv.alt.s, &args->tsv.refalt);
+    }
+    // do not silently ignore a failed update, report it to the caller instead
+    int ret = bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s);
+    args->tsv.ref.l = 0;
+    args->tsv.alt.l = 0;
+    args->tsv.refalt.l = 0;
+    return ret<0 ? -1 : 0;
+}
+// TSV column setter for REF. Appends the bytes between tsv->ss and tsv->se to
+// the REF buffer; once the ALT buffer is non-empty as well, both are passed on
+// to _set_ref_alt() and its return value is returned. Returns 0 otherwise.
+//   usr .. the args_t structure registered with the setter
+static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+    args_t *args = (args_t*) usr;
+    kstring_t *ref_buf = &args->tsv.ref;
+    kputsn(tsv->ss, tsv->se - tsv->ss, ref_buf);
+    if ( !args->tsv.alt.l )
+        return 0;   // ALT not seen yet, wait for its setter
+    return _set_ref_alt(args,rec);
+}
+// TSV column setter for ALT. Appends the bytes between tsv->ss and tsv->se to
+// the ALT buffer; once the REF buffer is non-empty as well, both are passed on
+// to _set_ref_alt() and its return value is returned. Returns 0 otherwise.
+//   usr .. the args_t structure registered with the setter
+static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+    args_t *args = (args_t*) usr;
+    kstring_t *alt_buf = &args->tsv.alt;
+    kputsn(tsv->ss, tsv->se - tsv->ss, alt_buf);
+    if ( !args->tsv.ref.l )
+        return 0;   // REF not seen yet, wait for its setter
+    return _set_ref_alt(args,rec);
+}
+
// Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error
static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
{
// REF,ALT
args->str.l = 0;
se = ++ss;
- while ( se < tsv->se && *se!='_' ) se++;
+ while ( se < tsv->se && *se!='_' ) se++;
if ( *se!='_' ) return -1;
kputsn(ss,se-ss,&args->str);
ss = ++se;
if ( aa >= ab )
{
if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
- else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
+ else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
}
- else if ( ab >= bb )
+ else if ( ab >= bb )
{
args->gts[2*i+0] = bcf_gt_unphased(0);
- args->gts[2*i+1] = bcf_gt_unphased(1);
+ args->gts[2*i+1] = bcf_gt_unphased(1);
}
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
}
else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
// up is short for "unphased"
- int nup = 0;
+ int nup = 0;
for (i=0; i<nsamples; i++)
{
char *ss = tsv->ss + 4*i + nup;
break;
default :
fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
- return -1;
+ return -1;
}
if( ss[all*2+up+1]=='*' ) up = up + 1;
}
-
+
if(up && up != 2)
{
fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
static void gensample_to_vcf(args_t *args)
{
/*
- * Inpute: IMPUTE2 output (indentation changed here for clarity):
+ * Input: IMPUTE2 output (indentation changed here for clarity):
*
* 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
* --- 20:62116698_C_A 62116698 C A 1 0 0 ...
*
* Second column is expected in the form of CHROM:POS_REF_ALT. We use second
- * column because the first can be empty ("--") when filling sites from reference
+ * column because the first can be empty ("--") when filling sites from reference
* panel. When the option --vcf-ids is given, the first column is used to set the
* VCF ID.
*
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
bcf1_t *rec = bcf_init();
nsamples -= 2;
}
while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
bcf_hdr_destroy(args->header);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
bcf1_t *rec = bcf_init();
args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
}
}
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
bcf1_t *rec = bcf_init();
nsamples -= 2;
}
while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
bcf_hdr_destroy(args->header);
}
for (i=0; i<nlines; i++) free(lines[i]);
free(lines);
- for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
return sample2sex;
}
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname)
+ if (sample_fname)
{
char *sample2sex = NULL;
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
nok++;
}
}
- fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
+ fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
if ( str.m ) free(str.s);
{
char *sample2sex = NULL;
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
-
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
else
kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
-
+
if ( args->hap2dip )
kputs("%_GT_TO_HAP2\n", &str);
else
{
if ( se - ss > 2 ) return -1; // currently only SNPs
- if ( ss[0]=='-' )
+ if ( ss[0]=='-' || ss[0]=='.' )
{
// missing GT
gts[0] = bcf_gt_missing;
if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
- gts[0] = bcf_gt_unphased(alleles[a0]);
+ gts[0] = bcf_gt_unphased(alleles[a0]);
gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
}
ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
- if ( ret==-2 )
+ if ( ret==-2 )
{
// something else than a SNP
free(ref);
args->str.l = 0;
kputc(ref[0], &args->str);
- for (i=0; i<5; i++)
+ for (i=0; i<5; i++)
{
if ( alleles[i]>0 )
{
static void tsv_to_vcf(args_t *args)
{
if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
- if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
args->ref = fai_load(args->ref_fname);
if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
- int i, n;
- char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
- if ( !smpls ) error("Could not parse %s\n", args->sample_list);
- for (i=0; i<n; i++)
+ int i, nsmpl;
+ char **smpl;
+ if ( args->sample_list )
{
- bcf_hdr_add_sample(args->header, smpls[i]);
- free(smpls[i]);
+ smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+ if ( !smpl ) error("Could not parse %s\n", args->sample_list);
+ for (i=0; i<nsmpl; i++)
+ {
+ bcf_hdr_add_sample(args->header, smpl[i]);
+ free(smpl[i]);
+ }
+ free(smpl);
+ bcf_hdr_add_sample(args->header, NULL);
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2);
}
- free(smpls);
- bcf_hdr_add_sample(args->header, NULL);
- args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
char wmode[8];
set_wmode(wmode,args->output_type,args->outfname,args->clevel);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
- if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+ if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 )
+ {
+ if ( args->sample_list ) error("Expected AA column with -s/-S\n");
+ if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 )
+ error("Expected REF and ALT columns when AA was not given\n");
+ }
bcf1_t *rec = bcf_init();
bcf_float_set_missing(rec->qual);
if ( !tsv_parse(tsv, rec, line.s) )
{
if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ args->n.written++;
}
else
args->n.skipped++;
if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
free(line.s);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
bcf_hdr_destroy(args->header);
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
tsv_destroy(tsv);
bcf_destroy(rec);
free(args->str.s);
free(args->gts);
+ free(args->tsv.ref.s);
+ free(args->tsv.alt.s);
+ free(args->tsv.refalt.s);
fprintf(stderr,"Rows total: \t%d\n", args->n.total);
fprintf(stderr,"Rows skipped: \t%d\n", args->n.skipped);
- fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
- fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
- fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
- fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
- fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+ fprintf(stderr,"Sites written: \t%d\n", args->n.written);
+ if ( args->sample_list )
+ {
+ fprintf(stderr,"Missing GTs: \t%d\n", args->n.missing);
+ fprintf(stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+ fprintf(stderr,"Het RA: \t%d\n", args->n.het_ra);
+ fprintf(stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+ fprintf(stderr,"Het AA: \t%d\n", args->n.het_aa);
+ }
}
static void vcf_to_vcf(args_t *args)
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
while ( bcf_sr_next_line(args->files) )
{
}
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
}
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
}
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
int32_t *itmp = NULL, nitmp = 0;
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
- if ( !pass )
+ if ( !pass )
{
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
continue;
}
}
free(itmp);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
}
fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
fprintf(stderr, " -G, --gensample2vcf ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
fprintf(stderr, "\n");
fprintf(stderr, "TSV conversion:\n");
fprintf(stderr, " --tsv2vcf FILE\n");
- fprintf(stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(stderr, " -c, --columns STRING Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n");
fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
fprintf(stderr, " -s, --samples LIST List of sample names\n");
fprintf(stderr, " -S, --samples-file FILE File of sample names\n");
{"fasta-ref",required_argument,NULL,'f'},
{"no-version",no_argument,NULL,10},
{"keep-duplicates",no_argument,NULL,12},
+ {"write-index",no_argument,NULL,16},
{NULL,0,NULL,0}
};
char *tmp;
case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
case 15 : args->gen_3N6 = 1; break;
+ case 16 : args->write_index = 1; break;
case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
case 'f': args->ref_fname = optarg; break;
case 'c': args->columns = optarg; break;
else args->infname = argv[optind];
}
if ( !args->infname ) usage();
-
+
if ( args->convert_func ) args->convert_func(args);
else vcf_to_vcf(args);
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
bcf_hdr_t *header;
void (*convert_func)(struct _args_t *);
struct {
- int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
+ int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written;
} n;
kstring_t str;
int32_t *gts;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname, *sex_fname;
int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
+ char *index_fn;
+ int write_index;
+ struct {
+ kstring_t ref,alt,refalt;
+ } tsv;
};
static void destroy_data(args_t *args)
free(samples);
}
+// Build the allele string "REF[,ALT]" from the buffered args->tsv.ref and
+// args->tsv.alt and pass it to bcf_update_alleles_str() for rec. ALT is left
+// out when it is "." or identical to REF. All three scratch buffers are
+// emptied afterwards. Always returns 0.
+static int _set_ref_alt(args_t *args, bcf1_t *rec)
+{
+ args->tsv.refalt.l = 0;
+ kputs(args->tsv.ref.s, &args->tsv.refalt);
+ // append ALT only if it is not "." and differs from REF
+ if ( strcmp(".",args->tsv.alt.s) && strcmp(args->tsv.ref.s,args->tsv.alt.s) )
+ {
+ kputc(',', &args->tsv.refalt);
+ kputs(args->tsv.alt.s, &args->tsv.refalt);
+ }
+ // NOTE(review): the return value of bcf_update_alleles_str() is not checked
+ bcf_update_alleles_str(args->header, rec, args->tsv.refalt.s);
+ // zero the lengths so the REF/ALT setters see empty buffers on their next call
+ args->tsv.ref.l = 0;
+ args->tsv.alt.l = 0;
+ args->tsv.refalt.l = 0;
+ return 0;
+}
+// TSV setter for the REF column: buffers the field tsv->ss..tsv->se in
+// args->tsv.ref (usr is the args_t). If the ALT field is already buffered,
+// the alleles are written to rec via _set_ref_alt(); otherwise nothing else
+// is done here and 0 is returned.
+static int tsv_setter_ref(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+    args_t *args = (args_t*) usr;
+    // Start from an empty buffer. _set_ref_alt() clears it only once both REF
+    // and ALT have been seen, so text left over from an incomplete REF/ALT
+    // pair would otherwise be kept and kputsn() would append to it.
+    args->tsv.ref.l = 0;
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.ref);
+    if ( args->tsv.alt.l ) return _set_ref_alt(args,rec);
+    return 0;
+}
+// TSV setter for the ALT column: buffers the field tsv->ss..tsv->se in
+// args->tsv.alt (usr is the args_t). If the REF field is already buffered,
+// the alleles are written to rec via _set_ref_alt(); otherwise nothing else
+// is done here and 0 is returned.
+static int tsv_setter_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
+{
+    args_t *args = (args_t*) usr;
+    // Start from an empty buffer. _set_ref_alt() clears it only once both REF
+    // and ALT have been seen, so text left over from an incomplete REF/ALT
+    // pair would otherwise be kept and kputsn() would append to it.
+    args->tsv.alt.l = 0;
+    kputsn(tsv->ss,tsv->se - tsv->ss,&args->tsv.alt);
+    if ( args->tsv.ref.l ) return _set_ref_alt(args,rec);
+    return 0;
+}
+
// Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error
static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
{
// REF,ALT
args->str.l = 0;
se = ++ss;
- while ( se < tsv->se && *se!='_' ) se++;
+ while ( se < tsv->se && *se!='_' ) se++;
if ( *se!='_' ) return -1;
kputsn(ss,se-ss,&args->str);
ss = ++se;
if ( aa >= ab )
{
if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
- else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
+ else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
}
- else if ( ab >= bb )
+ else if ( ab >= bb )
{
args->gts[2*i+0] = bcf_gt_unphased(0);
- args->gts[2*i+1] = bcf_gt_unphased(1);
+ args->gts[2*i+1] = bcf_gt_unphased(1);
}
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
}
else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
// up is short for "unphased"
- int nup = 0;
+ int nup = 0;
for (i=0; i<nsamples; i++)
{
char *ss = tsv->ss + 4*i + nup;
break;
default :
fprintf(bcftools_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
- return -1;
+ return -1;
}
if( ss[all*2+up+1]=='*' ) up = up + 1;
}
-
+
if(up && up != 2)
{
fprintf(bcftools_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
static void gensample_to_vcf(args_t *args)
{
/*
- * Inpute: IMPUTE2 output (indentation changed here for clarity):
+ * Input: IMPUTE2 output (indentation changed here for clarity):
*
* 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
* --- 20:62116698_C_A 62116698 C A 1 0 0 ...
*
* Second column is expected in the form of CHROM:POS_REF_ALT. We use second
- * column because the first can be empty ("--") when filling sites from reference
+ * column because the first can be empty ("--") when filling sites from reference
* panel. When the option --vcf-ids is given, the first column is used to set the
* VCF ID.
*
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
bcf1_t *rec = bcf_init();
nsamples -= 2;
}
while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 );
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname);
bcf_hdr_destroy(args->header);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
bcf1_t *rec = bcf_init();
args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2);
}
}
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
bcf1_t *rec = bcf_init();
nsamples -= 2;
}
while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 );
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname);
bcf_hdr_destroy(args->header);
}
for (i=0; i<nlines; i++) free(lines[i]);
free(lines);
- for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
return sample2sex;
}
if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname)
+ if (sample_fname)
{
char *sample2sex = NULL;
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
nok++;
}
}
- fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
+ fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
if ( str.m ) free(str.s);
{
char *sample2sex = NULL;
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
-
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
else
kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
-
+
if ( args->hap2dip )
kputs("%_GT_TO_HAP2\n", &str);
else
{
if ( se - ss > 2 ) return -1; // currently only SNPs
- if ( ss[0]=='-' )
+ if ( ss[0]=='-' || ss[0]=='.' )
{
// missing GT
gts[0] = bcf_gt_missing;
if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
- gts[0] = bcf_gt_unphased(alleles[a0]);
+ gts[0] = bcf_gt_unphased(alleles[a0]);
gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
}
ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
- if ( ret==-2 )
+ if ( ret==-2 )
{
// something else than a SNP
free(ref);
args->str.l = 0;
kputc(ref[0], &args->str);
- for (i=0; i<5; i++)
+ for (i=0; i<5; i++)
{
if ( alleles[i]>0 )
{
static void tsv_to_vcf(args_t *args)
{
if ( !args->ref_fname ) error("--tsv2vcf requires the --fasta-ref option\n");
- if ( !args->sample_list ) error("--tsv2vcf requires the --samples option\n");
args->ref = fai_load(args->ref_fname);
if ( !args->ref ) error("Could not load the reference %s\n", args->ref_fname);
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
- int i, n;
- char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
- if ( !smpls ) error("Could not parse %s\n", args->sample_list);
- for (i=0; i<n; i++)
+ int i, nsmpl;
+ char **smpl;
+ if ( args->sample_list )
{
- bcf_hdr_add_sample(args->header, smpls[i]);
- free(smpls[i]);
+ smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+ if ( !smpl ) error("Could not parse %s\n", args->sample_list);
+ for (i=0; i<nsmpl; i++)
+ {
+ bcf_hdr_add_sample(args->header, smpl[i]);
+ free(smpl[i]);
+ }
+ free(smpl);
+ bcf_hdr_add_sample(args->header, NULL);
+ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsmpl*2);
}
- free(smpls);
- bcf_hdr_add_sample(args->header, NULL);
- args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
char wmode[8];
set_wmode(wmode,args->output_type,args->outfname,args->clevel);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA");
if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n");
if ( tsv_register(tsv, "POS", tsv_setter_pos, NULL) < 0 ) error("Expected POS column\n");
if ( tsv_register(tsv, "ID", tsv_setter_id, args->header) < 0 && !args->columns ) error("Expected ID column\n");
- if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 ) error("Expected AA column\n");
+ if ( tsv_register(tsv, "AA", tsv_setter_aa, args) < 0 )
+ {
+ if ( args->sample_list ) error("Expected AA column with -s/-S\n");
+ if ( tsv_register(tsv, "REF", tsv_setter_ref, args) < 0 || tsv_register(tsv, "ALT", tsv_setter_alt, args) < 0 )
+ error("Expected REF and ALT columns when AA was not given\n");
+ }
bcf1_t *rec = bcf_init();
bcf_float_set_missing(rec->qual);
if ( !tsv_parse(tsv, rec, line.s) )
{
if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ args->n.written++;
}
else
args->n.skipped++;
if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname);
free(line.s);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
bcf_hdr_destroy(args->header);
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
tsv_destroy(tsv);
bcf_destroy(rec);
free(args->str.s);
free(args->gts);
+ free(args->tsv.ref.s);
+ free(args->tsv.alt.s);
+ free(args->tsv.refalt.s);
fprintf(bcftools_stderr,"Rows total: \t%d\n", args->n.total);
fprintf(bcftools_stderr,"Rows skipped: \t%d\n", args->n.skipped);
- fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing);
- fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
- fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra);
- fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
- fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa);
+ fprintf(bcftools_stderr,"Sites written: \t%d\n", args->n.written);
+ if ( args->sample_list )
+ {
+ fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing);
+ fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr);
+ fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra);
+ fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa);
+ fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa);
+ }
}
static void vcf_to_vcf(args_t *args)
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
while ( bcf_sr_next_line(args->files) )
{
}
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
}
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
}
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
+ if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname);
int32_t *itmp = NULL, nitmp = 0;
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
- if ( !pass )
+ if ( !pass )
{
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
continue;
}
}
free(itmp);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname);
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
}
fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
fprintf(bcftools_stderr, " -G, --gensample2vcf ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "TSV conversion:\n");
fprintf(bcftools_stderr, " --tsv2vcf FILE\n");
- fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file, see man page for details [ID,CHROM,POS,AA]\n");
fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
fprintf(bcftools_stderr, " -s, --samples LIST List of sample names\n");
fprintf(bcftools_stderr, " -S, --samples-file FILE File of sample names\n");
{"fasta-ref",required_argument,NULL,'f'},
{"no-version",no_argument,NULL,10},
{"keep-duplicates",no_argument,NULL,12},
+ {"write-index",no_argument,NULL,16},
{NULL,0,NULL,0}
};
char *tmp;
case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break;
case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break;
case 15 : args->gen_3N6 = 1; break;
+ case 16 : args->write_index = 1; break;
case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break;
case 'f': args->ref_fname = optarg; break;
case 'c': args->columns = optarg; break;
else args->infname = argv[optind];
}
if ( !args->infname ) usage();
-
+
if ( args->convert_func ) args->convert_func(args);
else vcf_to_vcf(args);
/* vcffilter.c -- Apply fixed-threshold filters.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *targets_list, *regions_list, *mask_list;
int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate;
regidx_t *mask;
+ char *index_fn;
+ int write_index;
}
args_t;
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
}
{"SnpGap",required_argument,NULL,'g'},
{"IndelGap",required_argument,NULL,'G'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,12},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
switch (c) {
case 'g':
- args->snp_gap = strtol(optarg,&tmp,10);
+ args->snp_gap = strtol(optarg,&tmp,10);
if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
if ( *tmp==':' )
{
else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
else error("Could not parse: --mask-overlap %s\n",optarg);
break;
+ case 12 : args->write_index = 1; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
init_data(args);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
while ( bcf_sr_next_line(args->files) )
{
bcf1_t *line = bcf_sr_get_line(args->files, 0);
}
}
buffered_filters(args, NULL);
-
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
destroy_data(args);
bcf_sr_destroy(args->files);
/* vcffilter.c -- Apply fixed-threshold filters.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *targets_list, *regions_list, *mask_list;
int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate;
regidx_t *mask;
+ char *index_fn;
+ int write_index;
}
args_t;
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
{"SnpGap",required_argument,NULL,'g'},
{"IndelGap",required_argument,NULL,'G'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,12},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) {
switch (c) {
case 'g':
- args->snp_gap = strtol(optarg,&tmp,10);
+ args->snp_gap = strtol(optarg,&tmp,10);
if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
if ( *tmp==':' )
{
else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2;
else error("Could not parse: --mask-overlap %s\n",optarg);
break;
+ case 12 : args->write_index = 1; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
init_data(args);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
while ( bcf_sr_next_line(args->files) )
{
bcf1_t *line = bcf_sr_get_line(args->files, 0);
}
}
buffered_filters(args, NULL);
-
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
destroy_data(args);
bcf_sr_destroy(args->files);
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
int regions_overlap, targets_overlap;
int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+ int nused[2][2];
double *pdiff, *qry_prob, *gt_prob;
uint32_t *ndiff,*ncnt,ncmp, npairs;
int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
}
if ( args->gt_samples )
- {
+ {
init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
args->gt_hdr ? args->gt_hdr : args->qry_hdr,
args->gt_fname ? args->gt_fname : args->qry_fname);
args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
// dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
- // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
+ // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
// probabilities of 0/0, 0/1, and 1/1 genotypes
for (i=0; i<8; i++)
for (j=0; j<3; j++)
args->gt_arr = args->qry_arr;
}
+ // stats: number of compared sites, and used tags
args->ncmp++;
+ args->nused[qry_use_GT][gt_use_GT]++;
double af,hwe_dsg[8];
if ( args->calc_hwe_prob )
gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
if ( !gt_dsg ) continue; // missing value
if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
-
+
ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
if ( !qry_dsg ) continue; // missing value
fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+ fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+ fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+ fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+ fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
fprintf(args->fp,"# DC, discordance:\n");
fprintf(args->fp,"# - query sample\n");
fprintf(args->fp,"# - genotyped sample\n");
- fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n");
- fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+ fprintf(args->fp,"# - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
+ fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
fprintf(args->fp,"# - number of sites compared (bigger is better)\n");
fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
return 1;
not_okay:
- fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
+ fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
bcf_seqname(hdr,rec),rec->pos+1,msg);
return 0;
}
args->es_max_mem = strdup("500M");
// In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
- // - min_inter: pairs with smaller err value will be considered identical
+ // - min_inter: pairs with smaller err value will be considered identical
// - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
// different. If negative, the cutoff may be heuristically lowered
args->min_inter_err = 0.23;
case 3 : args->calc_hwe_prob = 0; break;
case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
case 5 : args->dry_run = 1; break;
- case 6 :
+ case 6 :
args->distinctive_sites = strtod(optarg,&tmp);
if ( *tmp )
{
else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
break;
- case 'S':
+ case 'S':
if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
int regions_overlap, targets_overlap;
int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+ int nused[2][2];
double *pdiff, *qry_prob, *gt_prob;
uint32_t *ndiff,*ncnt,ncmp, npairs;
int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
}
if ( args->gt_samples )
- {
+ {
init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
args->gt_hdr ? args->gt_hdr : args->qry_hdr,
args->gt_fname ? args->gt_fname : args->qry_fname);
args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
// dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
- // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
+ // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
// probabilities of 0/0, 0/1, and 1/1 genotypes
for (i=0; i<8; i++)
for (j=0; j<3; j++)
args->gt_arr = args->qry_arr;
}
+ // stats: number of compared sites, and used tags
args->ncmp++;
+ args->nused[qry_use_GT][gt_use_GT]++;
double af,hwe_dsg[8];
if ( args->calc_hwe_prob )
gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
if ( !gt_dsg ) continue; // missing value
if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
-
+
ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
if ( !qry_dsg ) continue; // missing value
fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+ fprintf(args->fp,"INFO\tsites-used-PL-vs-PL\t%u\n",args->nused[0][0]);
+ fprintf(args->fp,"INFO\tsites-used-PL-vs-GT\t%u\n",args->nused[0][1]);
+ fprintf(args->fp,"INFO\tsites-used-GT-vs-PL\t%u\n",args->nused[1][0]);
+ fprintf(args->fp,"INFO\tsites-used-GT-vs-GT\t%u\n",args->nused[1][1]);
fprintf(args->fp,"# DC, discordance:\n");
fprintf(args->fp,"# - query sample\n");
fprintf(args->fp,"# - genotyped sample\n");
- fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n");
- fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+ fprintf(args->fp,"# - discordance (either an abstract score or number of mismatches, see -e/-u in the man page for details; smaller is better)\n");
+ fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes matches are more informative, bigger is better)\n");
fprintf(args->fp,"# - number of sites compared (bigger is better)\n");
fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
return 1;
not_okay:
- fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
+ fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
bcf_seqname(hdr,rec),rec->pos+1,msg);
return 0;
}
args->es_max_mem = strdup("500M");
// In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
- // - min_inter: pairs with smaller err value will be considered identical
+ // - min_inter: pairs with smaller err value will be considered identical
// - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
// different. If negative, the cutoff may be heuristically lowered
args->min_inter_err = 0.23;
case 3 : args->calc_hwe_prob = 0; break;
case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
case 5 : args->dry_run = 1; break;
- case 6 :
+ case 6 :
args->distinctive_sites = strtod(optarg,&tmp);
if ( *tmp )
{
else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
break;
- case 'S':
+ case 'S':
if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
- Copyright (C) 2012-2022 Genome Research Ltd.
+ Copyright (C) 2012-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
char *isec_exact;
int argc, record_cmd_line;
+ char *index_fn;
+ int write_index;
}
args_t;
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+ if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
+ error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
}
if ( !args->nwrite && !out_std && !args->prefix )
fprintf(stderr,"Note: -w option not given, printing list of sites...\n");
}
}
if ( str.s ) free(str.s);
- if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+ if ( out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+ }
}
static void add_filter(args_t *args, char *expr, int logic)
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->write_index = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
- Copyright (C) 2012-2022 Genome Research Ltd.
+ Copyright (C) 2012-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
char *isec_exact;
int argc, record_cmd_line;
+ char *index_fn;
+ int write_index;
}
args_t;
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+ if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 )
+ error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output");
}
if ( !args->nwrite && !out_std && !args->prefix )
fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n");
}
}
if ( str.s ) free(str.s);
- if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+ if ( out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out_fh)<0 )
+ {
+ if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-");
+ }
}
static void add_filter(args_t *args, char *expr, int logic)
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->write_index = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2022 Genome Research Ltd.
+ Copyright (C) 2012-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#define PL2PROB_MAX 1024
+// Rules for merging FORMAT Number=A,G,R vectors with missing values
+#define MERGE_MISSING_DOT 0 // leave as is, i.e. use a missing value "."
+#define MERGE_MISSING_CONST 1 // use a constant value
+#define MERGE_MISSING_MAX 2 // use the existing maximum value
+
+// One parsed entry of the -M, --missing-rules option: how to fill missing values
+// of a single FORMAT tag when merging Number=A,G,R vectors
+typedef struct _missing_rule_t
+{
+ char *hdr_tag; // name of the FORMAT tag the rule applies to; allocated in missing_rules_init(), freed in missing_rules_destroy()
+ int type; // one of MERGE_MISSING_DOT, MERGE_MISSING_CONST, MERGE_MISSING_MAX
+ float value; // the constant to fill in, set only for MERGE_MISSING_CONST
+}
+missing_rule_t;
+
// For merging INFO Number=A,G,R tags
typedef struct
{
int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles)
int mmap; // size of map array (only buffer[i].n_allele is actually used)
int als_differ;
+ int var_types; // variant types in this record, shifted by <<1 to account for VCF_REF
}
maux1_t;
+
+// Buffered lines for a single reader
typedef struct
{
int rid; // current rid
int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+ int unkn_allele;// the index of the unknown allele (<*>, <NON_REF>)
int cur; // current line or -1 if none
int mrec; // allocated size of buf
maux1_t *rec; // buffer to keep reader's lines
bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+ int var_types; // reader's variant types in the active [beg,end] window
}
buffer_t;
typedef struct
{
- int n, pos, var_types; // number of readers, current position, currently available variant types
+ int n, pos, var_types; // number of readers; current position; variant types at this position across all available records
+ int *als_types, // allele type of each output allele
+ mals_types;
char *chr; // current chromosome
char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
int nals, mals, nout_als, mout_als; // size of the output array
int *cnt, ncnt; // number of records that refer to the alleles
int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+ const char **fmt_key;// temporary short-lived array to store output tag names
bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
int nfmt_map; // number of rows in the fmt_map array
- int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
+ int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes, from src idxs to dst file idxs
void *tmp_arr;
size_t ntmp_arr;
buffer_t *buf;
faidx_t *gvcf_fai;
info_rule_t *rules;
int nrules;
+ char *missing_rules_str;
+ missing_rule_t *missing_rules; // lookup for -M, --missing-rules
+ int nmissing_rules;
strdict_t *tmph;
kstring_t tmps;
bcf_srs_t *files;
int argc, n_threads, record_cmd_line, clevel;
int local_alleles; // the value of -L option
int keep_AC_AN;
+ char *index_fn;
+ int write_index;
}
args_t;
}
}
+// qsort() comparator: orders two missing_rule_t elements alphabetically by FORMAT tag name
+static int missing_rules_comp_key2(const void *a, const void *b)
+{
+    const missing_rule_t *lhs = (const missing_rule_t*) a;
+    const missing_rule_t *rhs = (const missing_rule_t*) b;
+    return strcmp(lhs->hdr_tag, rhs->hdr_tag);
+}
+// bsearch() comparator: `a` is the searched FORMAT tag name (a C string),
+// `b` is an element of the sorted missing_rule_t array
+static int missing_rules_comp_key(const void *a, const void *b)
+{
+    const char *tag = (const char*) a;
+    return strcmp(tag, ((const missing_rule_t*) b)->hdr_tag);
+}
+// Parse the -M, --missing-rules string into args->missing_rules, an array sorted by
+// tag name so that it can be searched with bsearch() and missing_rules_comp_key().
+//
+// The string is a comma-separated list of TAG:LOGIC pairs, where LOGIC is "." (keep
+// the missing value), "max" (use the existing maximum) or a number (use a constant).
+// The value "-" stands for "PL:.,AD:."; with no string given, "PL:max,AD:0" is used
+// in the gVCF mode and no rules are created otherwise.
+//
+// A tag not defined as FORMAT in the output header is an error when the rules were
+// given by the user; rules coming from the built-in gVCF default are silently dropped.
+static void missing_rules_init(args_t *args)
+{
+    kstring_t str = {0,0,0};
+    if ( args->missing_rules_str )
+    {
+        if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str);
+        else kputs(args->missing_rules_str,&str);
+    }
+    else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str);
+    else return;
+
+    // First pass: split the string in place into NUL-terminated TAG and LOGIC tokens,
+    // count the rules and check that ':' and ',' strictly alternate
+    args->nmissing_rules = 1;
+    char *ss = str.s;
+    int n = 0;
+    while ( *ss )
+    {
+        if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str); }
+        ss++;
+    }
+    if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);
+    args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t));
+
+    // Second pass: walk the TAG,LOGIC token pairs and fill the rules
+    n = args->nmissing_rules;
+    args->nmissing_rules = 0;
+    ss = str.s;
+    while ( args->nmissing_rules < n )
+    {
+        // Both tokens point into str.s, which stays alive until the end of the function.
+        // The tag name is copied only once the rule is accepted, so the error messages
+        // below never read freed memory.
+        char *tag   = ss;
+        char *logic = tag + strlen(tag) + 1;
+
+        int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, tag);
+        int defined = bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) ? 1 : 0;
+        if ( !defined && args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", tag);
+        if ( !*logic ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", tag);
+
+        ss = logic + strlen(logic) + 1;     // move to the next TAG token
+        if ( !defined )
+        {
+            n--;    // a default rule for a tag absent from the header, drop it
+            continue;
+        }
+
+        missing_rule_t *rule = &args->missing_rules[args->nmissing_rules];
+        if ( !strcasecmp(logic,".") ) rule->type = MERGE_MISSING_DOT;
+        else if ( !strcasecmp(logic,"max") ) rule->type = MERGE_MISSING_MAX;
+        else
+        {
+            char *end = logic;
+            rule->value = strtod(logic, &end);
+            if ( *end ) error("Could not parse --missing-rules: \"%s\"\n", args->missing_rules_str);
+            rule->type = MERGE_MISSING_CONST;
+        }
+        rule->hdr_tag = strdup(tag);
+        args->nmissing_rules++;
+    }
+    qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2);
+    free(str.s);
+}
+// Release the tag names and the rule array created by missing_rules_init()
+static void missing_rules_destroy(args_t *args)
+{
+    int irule = args->nmissing_rules;
+    while ( irule-- > 0 )
+        free(args->missing_rules[irule].hdr_tag);
+    free(args->missing_rules);
+}
+
static int info_rules_comp_key2(const void *a, const void *b)
{
info_rule_t *rule1 = (info_rule_t*) a;
int i,j;
for (i=0; i<ma->nout_smpl; i++) free(ma->str[i].s);
free(ma->str);
+ free(ma->als_types);
for (i=0; i<ma->mals; i++)
{
free(ma->als[i]);
free(ma->AGR_info);
if (ma->ntmp_arr) free(ma->tmp_arr);
if (ma->nfmt_map) free(ma->fmt_map);
+ free(ma->fmt_key);
// ma->inf freed in bcf_destroy1
for (i=0; i<ma->mals; i++) free(ma->als[i]);
if (ma->mout_als) free(ma->out_als);
{
int i,j;
for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
- for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
for (i=0; i<ma->mals; i++)
{
free(ma->als[i]);
for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
{
ma->buf[i].rec[j].skip = 0;
+ ma->buf[i].rec[j].var_types = 0;
bcf1_t *line = ma->files->readers[i].buffer[j];
if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
}
int ir, j;
for (ir=0; ir<files->nreaders; ir++)
{
+ ma->buf[ir].unkn_allele = 0;
bcf1_t *line = maux_get_line(args,ir);
if ( !line ) continue;
for (j=1; j<line->n_allele; j++)
{
int irec = ma->buf[ir].cur;
if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+ if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j;
}
}
}
bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
ma->laa_dirty = 1;
}
-void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out)
{
bcf_srs_t *files = args->files;
bcf_hdr_t *out_hdr = args->out_hdr;
for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
continue; \
} \
- int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
- for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+ int haploid = ma->smpl_ploidy[ismpl+j]==1 ? 1 : 0; \
+ int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+ if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \
+ { \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
+ for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+ { \
+ for (l=0; l<ngsize; l++) { *tgt = mrule->value; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+ { \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ src_type_t max = src[0]; \
+ for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+ for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
+ } \
+ else \
+ { \
+ for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+ } \
for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
- if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+ if ( haploid ) \
{ \
- /* Haploid */ \
int iori, inew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
continue; \
} \
src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
- for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+ if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \
+ { \
+ int iunkn = ma->buf[i].unkn_allele; \
+ for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+ { \
+ for (l=0; l<nsize; l++) { *tgt = mrule->value; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+ { \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ src_type_t max = src[0]; \
+ for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+ for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
+ } \
+ else \
+ { \
+ for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+ } \
int iori,inew; \
for (iori=ifrom; iori<line->n_allele; iori++) \
{ \
{
ma->nfmt_map = 2;
ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+ ma->fmt_key = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key));
}
else
memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_fmt; j++)
{
- // Wat this tag already seen?
+ // Was this tag already seen?
bcf_fmt_t *fmt = &line->d.fmt[j];
const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
kitr = kh_get(strdict, tmph, key);
{
ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+ ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1));
ma->nfmt_map = max_ifmt+1;
}
if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; }
+ ma->fmt_key[max_ifmt] = key;
}
kitr = kh_put(strdict, tmph, key, &ret);
kh_value(tmph, kitr) = ifmt;
update_AN_AC(out_hdr, out);
for (i=1; i<=max_ifmt; i++)
- merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+ {
+ missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key);
+ merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out);
+ }
if ( ma->laa_dirty )
update_local_alleles(args, out);
{
int slen = 0;
char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+ if (!seq)
+ exit(1); // faidx_fetch_seq has already reported the error.
+
if (slen)
{
out->d.allele[0][0] = seq[0];
return 0;
}
-// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
-// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
-// to accommodate for VCF_GVCF_REF defined below
-static const int
- snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
- indel_mask = VCF_INDEL<<2,
- ins_mask = VCF_INS<<2,
- del_mask = VCF_DEL<<2,
- ref_mask = 2;
-
/*
Check incoming lines for new gVCF blocks, set pointer to the current source
buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
{
if ( ma->gvcf[ir].active )
{
- if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0;
+ if ( ma->pos > ma->gvcf[ir].end ) ma->gvcf[ir].active = 0;
else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block
}
if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
{
bcf_sr_t *reader = &files->readers[j];
buffer_t *buf = &maux->buf[j];
- fprintf(stderr," reader %d: ", j);
+ fprintf(stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end);
for (k=buf->beg; k<buf->end; k++)
{
- if ( buf->rec[k].skip & SKIP_DONE ) continue;
- bcf1_t *line = reader->buffer[k];
+ if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(stderr," DONE"); continue; }
+ bcf1_t *line = reader->buffer[k]; // selected for merging by can_merge
fprintf(stderr,"\t");
- if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record will not be merged in this round
+ if ( buf->cur==k ) fprintf(stderr,"!"); // selected for merging by stage_line
+ if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record cannot be merged in this round
+ if ( !line->n_allele && maux->gvcf[j].active )
+ fprintf(stderr,"<*>");
for (l=0; l<line->n_allele; l++)
fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]);
if ( buf->rec[k].skip ) fprintf(stderr,"]");
{
maux_t *maux = args->maux;
int i,j;
+ fprintf(stderr,"State after position=%d done:\n",maux->pos+1);
for (i=0; i<args->files->nreaders; i++)
{
- fprintf(stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+ fprintf(stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
if ( maux->buf[i].cur >=0 )
{
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
}
fprintf(stderr,"\n");
}
- fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min);
+ fprintf(stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min);
for (i=0; i<args->files->nreaders; i++)
{
- fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+ fprintf(stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1);
fprintf(stderr,"\n");
}
fprintf(stderr,"\n");
}
+
+// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
+// but shift by one bit to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault)
+static const int
+ snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1),
+ indel_mask = (VCF_INDEL<<1),
+ ins_mask = VCF_INS<<1,
+ del_mask = VCF_DEL<<1,
+ ref_mask = 1;
+
+// Can these types be merged given the -m settings? Despite the function's name, its focus is on
+// excluding incompatible records, there will be a finer matching later in stage_line()
+//
+// args            .. provides the -m mode (args->collapse), the allele comparator (args->vcmp)
+//                    and the alleles selected so far (args->maux->als)
+// selected_types  .. bitmask of the variant types selected so far, in the *_mask encoding
+//                    (variant type shifted by <<1, the lowest bit is REF); must be non-zero
+// buf, irec       .. the reader's buffer and the index of the tested record within it
+//
+// Returns 1 if the record can be merged, 0 if not
+static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+{
+ int k;
+ maux_t *maux = args->maux;
+ bcf1_t *rec = buf->lines[irec];
+ int rec_types = buf->rec[irec].var_types;
+
+ assert( selected_types ); // this is trivially true, set in can_merge()
+
+ if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything
+
+ // REF and gVCF_REF with no other alleles present can be merged with anything
+ if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+ if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
+
+ if ( args->collapse!=COLLAPSE_NONE )
+ {
+ // If we are here, one of the following modes must have been set: both,snps,indels,snp-ins-del
+ // Include the new record if
+ // - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del
+ // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
+ if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
+ {
+ if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+ }
+ if ( args->collapse&COLLAPSE_INDELS )
+ {
+ if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+ }
+ if ( args->collapse&COLLAPSE_SNP_INS_DEL )
+ {
+ if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
+ if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+ }
+ // Whatever is left, allow to match if the alleles match exactly
+ }
+
+ // The -m none mode or exact matching requested
+ // Simple test first: are the variants of the same type?
+ int x = selected_types >> 1; // remove REF
+ int y = rec_types >> 1; // remove REF
+ // Shift both masks in lockstep until one of them runs out of bits; a non-zero remainder
+ // means the highest set bit of one mask is above the highest set bit of the other
+ while ( x && y ) { x>>=1; y>>=1; }
+ if ( x || y ) return 0; // the types differ
+
+ if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible
+ // The loop stops at the first ALT allele of rec that is found among the selected alleles
+ for (k=1; k<rec->n_allele; k++)
+ {
+ if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or <NON_REF>)
+ if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
+ }
+ // NOTE(review): the original comments here spoke of "a new allele" and "all alleles", but the
+ // loop breaks on the first match, so one shared ALT allele is enough - confirm this is intended
+ if ( k==rec->n_allele ) return 0; // none of rec's ALT alleles (gVCF_REF aside) is among the alleles selected thus far
+ return 1; // at least one ALT allele of rec is already among the alleles selected thus far
+}
+
+// Add the alleles of record `irec` from reader `ireader` to the list of output alleles
+// (maux->als). Along the way fill the record's map from its own allele indexes to the
+// output allele indexes, count how many times each output allele was seen and remember
+// the variant type of each output allele.
+static void maux_update_alleles(args_t *args, int ireader, int irec)
+{
+    maux_t *aux    = args->maux;
+    buffer_t *rbuf = &aux->buf[ireader];
+    maux1_t *rec1  = &rbuf->rec[irec];
+    bcf1_t *rec    = rbuf->lines[irec];
+    // with -m snp-ins-del the ins and del flags must be told apart, so VCF_INDEL is stripped
+    const int strip_indel = ( args->collapse==COLLAPSE_SNP_INS_DEL );
+    int ial;
+
+    hts_expand(int, rec->n_allele, rec1->mmap, rec1->map);
+
+    if ( aux->nals )
+    {
+        // Some alleles were collected already: normalize and merge the new ones in
+        aux->als = merge_alleles(rec->d.allele, rec->n_allele, rec1->map, aux->als, &aux->nals, &aux->mals);
+        if ( !aux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",aux->chr,(int64_t) rec->pos+1,args->files->readers[ireader].fname);
+        hts_expand0(int, aux->nals, aux->ncnt, aux->cnt);
+        hts_expand0(int, aux->nals, aux->mals_types, aux->als_types);
+        for (ial=1; ial<rec->n_allele; ial++)
+        {
+            int iout = rec1->map[ial];
+            int type = bcf_has_variant_type(rec, ial, VCF_ANY);
+            if ( strip_indel ) type &= ~VCF_INDEL;
+            aux->als_types[iout] = type ? type<<1 : ref_mask;
+            aux->cnt[iout]++;   // how many times an allele appears in the files
+        }
+        aux->cnt[0]++;
+        return;
+    }
+
+    // The first record to be merged: its alleles are copied to the output as they are
+    aux->nals = rec->n_allele;
+    hts_expand0(char*, aux->nals, aux->mals, aux->als);
+    hts_expand0(int, aux->nals, aux->ncnt, aux->cnt);
+    hts_expand0(int, aux->nals, aux->mals_types, aux->als_types);
+    for (ial=0; ial<aux->nals; ial++)
+    {
+        free(aux->als[ial]);
+        aux->als[ial] = strdup(rec->d.allele[ial]);
+        rec1->map[ial] = ial;
+        aux->cnt[ial]  = 1;
+        int type = bcf_has_variant_type(rec, ial, VCF_ANY);
+        if ( strip_indel ) type &= ~VCF_INDEL;
+        aux->als_types[ial] = type ? type<<1 : ref_mask;
+    }
+}
+
/*
- Determine which line should be merged from which reader: go through all
- readers and all buffered lines, expand REF,ALT and try to match lines with
- the same ALTs.
+ Determine which lines remain to be merged across readers at the current position and
+ are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible,
+ SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time.
+
+ At the same time count how many times each allele is present across the readers and records
+ so that we can prioritize the records with the same alleles to come first. In the end at most
+ one record at a time can be selected from each reader and that will be done in stage_line().
+
+ The function maux_reset already initialized structures for this position, so here each
+ reader comes with the beg,end indexes that point to records with the same maux_t.pos position.
*/
int can_merge(args_t *args)
{
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
- int i,j,k, ntodo = 0;
+ int i,j, ntodo = 0;
for (i=0; i<maux->nals; i++)
{
free(maux->als[i]);
maux->als[i] = NULL;
+ maux->cnt[i] = 0;
}
maux->var_types = maux->nals = 0;
- // this is only for the `-m none -g` mode, ensure that <*> lines come last
- #define VCF_GVCF_REF 1
-
+ // In this loop we do the following:
+ // - remember the first encountered ID if matching by ID
+ // - count the number of unprocessed records at this position
+ // - collect all variant types at this position. This is to be able to perform -m matching and
+ // print SNVs first, then indels, then gVCF blocks
+ // - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
+ buf->var_types = 0;
- if ( gaux && gaux[i].active )
+ if ( gaux && gaux[i].active ) // active gvcf block
{
- // skip readers with active gvcf blocks
buf->rec[buf->beg].skip = SKIP_DIFF;
+ maux->var_types |= ref_mask;
+ buf->var_types |= ref_mask;
+ buf->rec[buf->beg].var_types = ref_mask;
continue;
}
+
+ // for gvcf: find out REF at this position
+ if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0];
+
for (j=buf->beg; j<buf->end; j++)
{
if ( buf->rec[j].skip & SKIP_DONE ) continue;
ntodo++;
bcf1_t *line = buf->lines[j];
- if ( args->merge_by_id )
- id = line->d.id;
- else
+ if ( args->merge_by_id && !id ) { id = line->d.id; continue; } // set ID when merging by id
+
+ if ( !buf->rec[j].var_types )
{
int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
- if (var_type < 0) error("bcf_has_variant_types() failed.");
+ if ( var_type < 0 ) error("bcf_has_variant_types() failed.");
if ( args->collapse==COLLAPSE_SNP_INS_DEL )
{
// need to distinguish between ins and del so strip the VCF_INDEL flag
var_type &= ~VCF_INDEL;
}
- maux->var_types |= var_type ? var_type<<2 : 2;
-
- // for the `-m none -g` mode
- if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
- maux->var_types |= VCF_GVCF_REF;
+ var_type = var_type ? var_type<<1 : ref_mask;
+ if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+ buf->rec[j].var_types = var_type;
}
+ maux->var_types |= buf->rec[j].var_types;
+ buf->var_types |= buf->rec[j].var_types;
}
-
- // for gvcf: find out REF at this position
- if ( buf->beg < buf->end && ref=='N' )
- ref = buf->lines[buf->beg]->d.allele[0][0];
}
if ( !ntodo ) return 0;
+ int selected_types = 0;
+
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
// position and normalize relevant alleles.
// REF-only sites may be associated with both SNPs and indels.
for (i=0; i<files->nreaders; i++)
{
- bcf_sr_t *reader = &files->readers[i];
buffer_t *buf = &maux->buf[i];
-
if ( gaux && gaux[i].active )
{
+ // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position
gaux[i].line->d.allele[0][0] = ref;
gaux[i].line->pos = maux->pos;
+ maux_update_alleles(args, i, buf->beg);
+ selected_types |= ref_mask;
+ continue;
}
-
for (j=buf->beg; j<buf->end; j++)
{
if ( buf->rec[j].skip & SKIP_DONE ) continue;
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
-
- int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
- if (line_type < 0) error("bcf_has_variant_types() failed.");
- line_type = line_type ? line_type<<2 : 2;
+ int line_types = buf->rec[j].var_types;
// select relevant lines
if ( args->merge_by_id )
{
- if ( strcmp(id,line->d.id) ) continue;
+ if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record
}
+ else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
else
{
- // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
- // records come last, otherwise infinite loop is created (#1164)
- if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
- {
- if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
- }
- if ( args->collapse==COLLAPSE_NONE && maux->nals )
- {
- // All alleles of the tested record must be present in the
- // selected maux record plus variant types must be the same
- if ( (maux->var_types & line_type) != line_type ) continue;
- if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
- for (k=1; k<line->n_allele; k++)
- {
- if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
- }
- if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele
- }
- if ( !(args->collapse&COLLAPSE_ANY) )
- {
- // Merge:
- // - SNPs+SNPs+MNPs+REF if -m both,snps
- // - indels+indels+REF if -m both,indels, REF only if SNPs are not present
- // - SNPs come first
- if ( line_type & (indel_mask|ins_mask|del_mask) )
- {
- if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
- if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
- }
- }
+ // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+ if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics
+ && (maux->var_types&snp_mask) // there are SNVs at the current position
+ && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+ ) continue;
}
- buf->rec[j].skip = 0;
+ selected_types |= line_types;
- hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
- if ( !maux->nals ) // first record, copy the alleles to the output
- {
- maux->nals = line->n_allele;
- hts_expand0(char*, maux->nals, maux->mals, maux->als);
- hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
- for (k=0; k<maux->nals; k++)
- {
- free(maux->als[k]);
- maux->als[k] = strdup(line->d.allele[k]);
- buf->rec[j].map[k] = k;
- maux->cnt[k] = 1;
- }
- continue;
- }
- // normalize alleles
- maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
- if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
- hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
- for (k=1; k<line->n_allele; k++)
- maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files
- maux->cnt[0]++;
+ buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line
+ maux_update_alleles(args, i, j);
}
}
return 1;
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
- // debug_maux(args);
-
- // take the most frequent allele present in multiple files, REF is skipped
- int i,j,k,icnt = 1;
- for (i=2; i<maux->nals; i++)
- if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+ // Take the most frequent allele present in multiple files, REF and gVCF_REF is skipped.
+ int i,j,k,icnt = -1;
+ for (i=1; i<maux->nals; i++)
+ {
+ if ( maux->als_types[i] & ref_mask ) continue;
+ if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i;
+ }
+ int selected_type = icnt>0 ? maux->als_types[icnt] : ref_mask;
int nout = 0;
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
buf->cur = -1;
- if ( buf->beg >= buf->end ) continue; // no lines in the buffer
+ if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site
// find lines with the same allele
for (j=buf->beg; j<buf->end; j++)
{
- if ( buf->rec[j].skip ) continue; // done or not compatible
- if ( args->merge_by_id ) break;
- if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record
+ if ( buf->rec[j].skip )
+ {
+ int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
+ if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+ if ( !is_gvcf ) continue; // done or not compatible
+ }
+ if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, the this is THE line
+
+ // skip if the reader has a record that matches the most frequent allele and this record is not it
+ if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue;
+ // if the reader does not have the most frequent allele type but is a ref, accept
+ if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break;
+ if ( selected_type==ref_mask ) break;
+
+ // accept if the record has the most frequent allele
for (k=0; k<buf->lines[j]->n_allele; k++)
if ( icnt==buf->rec[j].map[k] ) break;
-
if ( k<buf->lines[j]->n_allele ) break;
}
if ( j>=buf->end )
{
// no matching allele found in this file
- if ( args->collapse==COLLAPSE_NONE ) continue;
+ if ( args->collapse==COLLAPSE_NONE ) continue; // exact matching requested, skip
+ // choose something compatible to create a multiallelic site given the -m criteria
for (j=buf->beg; j<buf->end; j++)
{
if ( buf->rec[j].skip ) continue; // done or not compatible
if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
- int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
- if (line_type < 0) error("bcf_has_variant_types() failed.");
- if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
- if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
- if ( line_type==VCF_REF )
+ int line_type = buf->rec[j].var_types;
+ if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( line_type&ref_mask )
{
if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
{
// found a suitable line for merging
buf->cur = j;
-
- // mark as finished so that it's ignored next time
- buf->rec[j].skip = SKIP_DONE;
- nout++;
}
}
+
+ // debug_maux(args);
+
+ // Mark lines staged for merging as finished so that they are ignored next time
+ for (i=0; i<files->nreaders; i++)
+ {
+ buffer_t *buf = &maux->buf[i];
+ if ( buf->cur == -1 ) continue;
+
+ buf->rec[buf->cur].skip = SKIP_DONE;
+ nout++;
+ }
+
assert( nout );
}
error_errno("[%s] Failed to update header", __func__);
}
info_rules_init(args);
+ missing_rules_init(args);
bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
return;
}
+ else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
args->maux = maux_init(args);
gvcf_flush(args,1);
info_rules_destroy(args);
+ missing_rules_destroy(args);
maux_destroy(args->maux);
bcf_hdr_destroy(args->out_hdr);
- if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"stdout");
bcf_destroy1(args->out_line);
kh_destroy(strdict, args->tmph);
if ( args->tmps.m ) free(args->tmps.s);
fprintf(stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n");
fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
- fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+ fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
fprintf(stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list FILE Read file names from the file\n");
fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " -M, --missing-rules TAG:METHOD Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
}
{"regions-file",required_argument,NULL,'R'},
{"regions-overlap",required_argument,NULL,4},
{"info-rules",required_argument,NULL,'i'},
+ {"missing-rules",required_argument,NULL,'M'},
{"no-version",no_argument,NULL,8},
{"no-index",no_argument,NULL,10},
{"filter-logic",required_argument,NULL,'F'},
+ {"write-index",no_argument,NULL,11},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
switch (c) {
case 'L':
args->local_alleles = strtol(optarg,&tmp,10);
break;
case 'l': args->file_list = optarg; break;
case 'i': args->info_rules = optarg; break;
+ case 'M': args->missing_rules_str = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
- else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
+ else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
else error("The -m type \"%s\" is not recognised.\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->no_index = 1; break;
+ case 11 : args->write_index = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2022 Genome Research Ltd.
+ Copyright (C) 2012-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#define PL2PROB_MAX 1024
+// Rules for merging FORMAT Number=A,G,R vectors with missing values
+#define MERGE_MISSING_DOT 0 // leave as is, i.e. use a missing value "."
+#define MERGE_MISSING_CONST 1 // use a constant value
+#define MERGE_MISSING_MAX 2 // use the existing maximum value
+
+// One parsed TAG:METHOD entry of the -M, --missing-rules option
+typedef struct _missing_rule_t
+{
+ char *hdr_tag; // name of the FORMAT tag the rule applies to; also the sort/lookup key
+ int type; // one of the MERGE_MISSING_* methods defined above
+ float value; // the constant to fill in, set only for MERGE_MISSING_CONST
+}
+missing_rule_t;
+
// For merging INFO Number=A,G,R tags
typedef struct
{
int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles)
int mmap; // size of map array (only buffer[i].n_allele is actually used)
int als_differ;
+ int var_types; // variant types in this record, shifted by <<1 to account for VCF_REF
}
maux1_t;
+
+// Buffered lines for a single reader
typedef struct
{
int rid; // current rid
int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+ int unkn_allele;// the index of the unknown allele (<*>, <NON_REF>)
int cur; // current line or -1 if none
int mrec; // allocated size of buf
maux1_t *rec; // buffer to keep reader's lines
bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+ int var_types; // reader's variant types in the active [beg,end] window
}
buffer_t;
typedef struct
{
- int n, pos, var_types; // number of readers, current position, currently available variant types
+ int n, pos, var_types; // number of readers; current position; variant types at this position across all available records
+ int *als_types, // allele type of each output allele
+ mals_types;
char *chr; // current chromosome
char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
int nals, mals, nout_als, mout_als; // size of the output array
int *cnt, ncnt; // number of records that refer to the alleles
int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
+ const char **fmt_key;// temporary short-lived array to store output tag names
bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
int nfmt_map; // number of rows in the fmt_map array
- int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
+ int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes, from src idxs to dst file idxs
void *tmp_arr;
size_t ntmp_arr;
buffer_t *buf;
faidx_t *gvcf_fai;
info_rule_t *rules;
int nrules;
+ char *missing_rules_str;
+ missing_rule_t *missing_rules; // lookup for -M, --missing-rules
+ int nmissing_rules;
strdict_t *tmph;
kstring_t tmps;
bcf_srs_t *files;
int argc, n_threads, record_cmd_line, clevel;
int local_alleles; // the value of -L option
int keep_AC_AN;
+ char *index_fn;
+ int write_index;
}
args_t;
}
}
+// qsort() comparator: orders two missing_rule_t elements by their hdr_tag string
+static int missing_rules_comp_key2(const void *a, const void *b)
+{
+    const missing_rule_t *lhs = a, *rhs = b;
+    return strcmp(lhs->hdr_tag, rhs->hdr_tag);
+}
+// bsearch() comparator: `a` is the tag name being searched for, `b` is a missing_rule_t
+// element of the array; the two are compared via the element's hdr_tag string
+static int missing_rules_comp_key(const void *a, const void *b)
+{
+    const char *wanted = a;
+    return strcmp(wanted, ((const missing_rule_t*) b)->hdr_tag);
+}
+/*
+    Parse the -M, --missing-rules specification into the args->missing_rules array, sorted
+    by tag name so that it can be searched with bsearch() and missing_rules_comp_key().
+
+    The specification is a comma-separated list of TAG:METHOD pairs where METHOD is ".",
+    "max" or a number. The string "-" stands for "PL:.,AD:.". When the option was not given,
+    "PL:max,AD:0" is used with -g and nothing is set up otherwise.
+
+    A tag given explicitly by the user must be defined as FORMAT in the output header, it is
+    an error otherwise. Tags coming from the -g default that are not defined in the header
+    are silently dropped.
+*/
+static void missing_rules_init(args_t *args)
+{
+    kstring_t str = {0,0,0};
+    if ( args->missing_rules_str )
+    {
+        if ( !strcmp("-",args->missing_rules_str) ) kputs("PL:.,AD:.",&str);
+        else kputs(args->missing_rules_str,&str);
+    }
+    else if ( args->do_gvcf ) kputs("PL:max,AD:0",&str);
+    else return;
+
+    // The text shown in parse errors. Must never be NULL, which the user-supplied string is
+    // when the rules come from the -g default
+    const char *spec = args->missing_rules_str ? args->missing_rules_str : "PL:max,AD:0";
+
+    // First pass: split the string in place into NUL-terminated tokens and count the rules.
+    // The separators must strictly alternate ':' and ',', starting and ending with ':'
+    args->nmissing_rules = 1;
+    char *ss = str.s, *beg = ss;
+    int n = 0;
+    while ( *ss )
+    {
+        if ( *ss==':' ) { *ss = 0; n++; if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", spec); }
+        else if ( *ss==',' ) { *ss = 0; args->nmissing_rules++; n++; if ( n%2==1 ) error("Could not parse --missing-rules: \"%s\"\n", spec); }
+        ss++;
+    }
+    if ( n%2==0 ) error("Could not parse --missing-rules: \"%s\"\n", spec);
+    args->missing_rules = (missing_rule_t*) calloc(args->nmissing_rules,sizeof(missing_rule_t));
+
+    // Second pass: walk the TAG,METHOD token pairs and fill the rules
+    n = args->nmissing_rules;
+    args->nmissing_rules = 0;
+    ss = beg;
+    while ( args->nmissing_rules < n )
+    {
+        char *tag = ss;
+        int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, tag);
+        int known = bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_FMT,id) ? 1 : 0;
+        if ( !known && args->missing_rules_str ) error("The FORMAT tag is not defined in the header: \"%s\"\n", tag);
+
+        // The tag is reported straight from the token buffer, so no freed memory is ever read
+        char *method = strchr(tag, '\0') + 1;
+        if ( !*method ) error("Could not parse --missing-rules, missing logic of \"%s\"\n", tag);
+        ss = strchr(method, '\0') + 1;     // start of the next pair
+
+        if ( !known )
+        {
+            // a default rule for a tag which is not present in the header, drop it
+            n--;
+            continue;
+        }
+
+        missing_rule_t *rule = &args->missing_rules[args->nmissing_rules];
+        if ( !strcasecmp(method,".") ) rule->type = MERGE_MISSING_DOT;
+        else if ( !strcasecmp(method,"max") ) rule->type = MERGE_MISSING_MAX;
+        else
+        {
+            char *end = method;
+            rule->value = strtod(method, &end);
+            if ( *end ) error("Could not parse --missing-rules: \"%s\"\n", spec);
+            rule->type = MERGE_MISSING_CONST;
+        }
+        rule->hdr_tag = strdup(tag);
+        args->nmissing_rules++;
+    }
+    qsort(args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key2);
+    free(str.s);
+}
+// Free the tag name of each of the args->nmissing_rules rules and then the rules array itself
+static void missing_rules_destroy(args_t *args)
+{
+    int left = args->nmissing_rules;
+    while ( left-- > 0 )
+        free(args->missing_rules[left].hdr_tag);
+    free(args->missing_rules);
+}
+
static int info_rules_comp_key2(const void *a, const void *b)
{
info_rule_t *rule1 = (info_rule_t*) a;
int i,j;
for (i=0; i<ma->nout_smpl; i++) free(ma->str[i].s);
free(ma->str);
+ free(ma->als_types);
for (i=0; i<ma->mals; i++)
{
free(ma->als[i]);
free(ma->AGR_info);
if (ma->ntmp_arr) free(ma->tmp_arr);
if (ma->nfmt_map) free(ma->fmt_map);
+ free(ma->fmt_key);
// ma->inf freed in bcf_destroy1
for (i=0; i<ma->mals; i++) free(ma->als[i]);
if (ma->mout_als) free(ma->out_als);
{
int i,j;
for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
- for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
for (i=0; i<ma->mals; i++)
{
free(ma->als[i]);
for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
{
ma->buf[i].rec[j].skip = 0;
+ ma->buf[i].rec[j].var_types = 0;
bcf1_t *line = ma->files->readers[i].buffer[j];
if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
}
int ir, j;
for (ir=0; ir<files->nreaders; ir++)
{
+ ma->buf[ir].unkn_allele = 0;
bcf1_t *line = maux_get_line(args,ir);
if ( !line ) continue;
for (j=1; j<line->n_allele; j++)
{
int irec = ma->buf[ir].cur;
if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+ if ( bcf_has_variant_type(line,j,VCF_REF) && line->d.allele[j][0]=='<' ) ma->buf[ir].unkn_allele = j;
}
}
}
bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
ma->laa_dirty = 1;
}
-void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
+void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, missing_rule_t *mrule, bcf1_t *out)
{
bcf_srs_t *files = args->files;
bcf_hdr_t *out_hdr = args->out_hdr;
for (l=1; l<nsize; l++) { tgt++; tgt_set_vector_end; } \
continue; \
} \
- int ngsize = ma->smpl_ploidy[ismpl+j]==1 ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
- for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+ int haploid = ma->smpl_ploidy[ismpl+j]==1 ? 1 : 0; \
+ int ngsize = haploid ? out->n_allele : out->n_allele*(out->n_allele + 1)/2; \
+ if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \
+ { \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ int iunkn = haploid ? ma->buf[i].unkn_allele : (ma->buf[i].unkn_allele+1)*(ma->buf[i].unkn_allele + 2)/2 - 1; \
+ for (l=0; l<ngsize; l++) { *tgt = src[iunkn]; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+ { \
+ for (l=0; l<ngsize; l++) { *tgt = mrule->value; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+ { \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ src_type_t max = src[0]; \
+ for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+ for (l=0; l<ngsize; l++) { *tgt = max; tgt++; } \
+ } \
+ else \
+ { \
+ for (l=0; l<ngsize; l++) { tgt_set_missing; tgt++; } \
+ } \
for (; l<nsize; l++) { tgt_set_vector_end; tgt++; } \
- if ( ma->smpl_ploidy[ismpl+j]==1 ) \
+ if ( haploid ) \
{ \
- /* Haploid */ \
int iori, inew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
continue; \
} \
src = (src_type_t*) (fmt_ori->p + j*fmt_ori->size); \
- for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+ if ( ma->buf[i].unkn_allele ) /* Use value from the unknown allele when available */ \
+ { \
+ int iunkn = ma->buf[i].unkn_allele; \
+ for (l=0; l<nsize; l++) { *tgt = src[iunkn]; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_CONST ) \
+ { \
+ for (l=0; l<nsize; l++) { *tgt = mrule->value; tgt++; } \
+ } \
+ else if ( mrule && mrule->type==MERGE_MISSING_MAX ) \
+ { \
+ src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ src_type_t max = src[0]; \
+ for (l=1; l<fmt_ori->n; l++) if ( max < src[l] ) max = src[l]; \
+ for (l=0; l<nsize; l++) { *tgt = max; tgt++; } \
+ } \
+ else \
+ { \
+ for (l=0; l<nsize; l++) { tgt_set_missing; tgt++; } \
+ } \
int iori,inew; \
for (iori=ifrom; iori<line->n_allele; iori++) \
{ \
{
ma->nfmt_map = 2;
ma->fmt_map = (bcf_fmt_t**) calloc(ma->nfmt_map*files->nreaders, sizeof(bcf_fmt_t*));
+ ma->fmt_key = (const char**) malloc(ma->nfmt_map*sizeof(*ma->fmt_key));
}
else
memset(ma->fmt_map, 0, ma->nfmt_map*files->nreaders*sizeof(bcf_fmt_t**));
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_fmt; j++)
{
- // Wat this tag already seen?
+ // Was this tag already seen?
bcf_fmt_t *fmt = &line->d.fmt[j];
const char *key = hdr->id[BCF_DT_ID][fmt->id].key;
kitr = kh_get(strdict, tmph, key);
{
ma->fmt_map = (bcf_fmt_t**) realloc(ma->fmt_map, sizeof(bcf_fmt_t*)*(max_ifmt+1)*files->nreaders);
memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
+ ma->fmt_key = (const char**) realloc(ma->fmt_key, sizeof(*ma->fmt_key)*(max_ifmt+1));
ma->nfmt_map = max_ifmt+1;
}
if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; }
+ ma->fmt_key[max_ifmt] = key;
}
kitr = kh_put(strdict, tmph, key, &ret);
kh_value(tmph, kitr) = ifmt;
update_AN_AC(out_hdr, out);
for (i=1; i<=max_ifmt; i++)
- merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+ {
+ missing_rule_t *rule = (missing_rule_t*) bsearch(ma->fmt_key[i], args->missing_rules, args->nmissing_rules, sizeof(*args->missing_rules), missing_rules_comp_key);
+ merge_format_field(args, &ma->fmt_map[i*files->nreaders], rule, out);
+ }
if ( ma->laa_dirty )
update_local_alleles(args, out);
{
int slen = 0;
char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+ if (!seq)
+ bcftools_exit(1); // faidx_fetch_seq has already reported the error.
+
if (slen)
{
out->d.allele[0][0] = seq[0];
return 0;
}
-// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
-// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
-// to accommodate for VCF_GVCF_REF defined below
-static const int
- snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
- indel_mask = VCF_INDEL<<2,
- ins_mask = VCF_INS<<2,
- del_mask = VCF_DEL<<2,
- ref_mask = 2;
-
/*
Check incoming lines for new gVCF blocks, set pointer to the current source
buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
{
if ( ma->gvcf[ir].active )
{
- if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0;
+ if ( ma->pos > ma->gvcf[ir].end ) ma->gvcf[ir].active = 0;
else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block
}
if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
{
bcf_sr_t *reader = &files->readers[j];
buffer_t *buf = &maux->buf[j];
- fprintf(bcftools_stderr," reader %d: ", j);
+ fprintf(bcftools_stderr," reader %d (k=%d-%d): ", j,buf->beg,buf->end);
for (k=buf->beg; k<buf->end; k++)
{
- if ( buf->rec[k].skip & SKIP_DONE ) continue;
- bcf1_t *line = reader->buffer[k];
+ if ( buf->rec[k].skip & SKIP_DONE ) { fprintf(bcftools_stderr," DONE"); continue; }
+ bcf1_t *line = reader->buffer[k]; // selected for merging by can_merge
fprintf(bcftools_stderr,"\t");
- if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"["); // this record will not be merged in this round
+ if ( buf->cur==k ) fprintf(bcftools_stderr,"!"); // selected for merging by stage_line
+ if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"["); // this record cannot be merged in this round
+ if ( !line->n_allele && maux->gvcf[j].active )
+ fprintf(bcftools_stderr,"<*>");
for (l=0; l<line->n_allele; l++)
fprintf(bcftools_stderr,"%s%s", l==0?"":",", line->d.allele[l]);
if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"]");
{
maux_t *maux = args->maux;
int i,j;
+ fprintf(bcftools_stderr,"State after position=%d done:\n",maux->pos+1);
for (i=0; i<args->files->nreaders; i++)
{
- fprintf(bcftools_stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
+ fprintf(bcftools_stderr,"\treader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end);
if ( maux->buf[i].cur >=0 )
{
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
}
fprintf(bcftools_stderr,"\n");
}
- fprintf(bcftools_stderr,"gvcf_min=%d\n", args->maux->gvcf_min);
+ fprintf(bcftools_stderr,"\tgvcf_min=%d\n", args->maux->gvcf_min);
for (i=0; i<args->files->nreaders; i++)
{
- fprintf(bcftools_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
+ fprintf(bcftools_stderr,"\t\treader %d:\tgvcf_active=%d", i,maux->gvcf[i].active);
if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1);
fprintf(bcftools_stderr,"\n");
}
fprintf(bcftools_stderr,"\n");
}
+
+// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
+// but shift by one bit to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault)
+static const int
+ snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1),
+ indel_mask = (VCF_INDEL<<1),
+ ins_mask = VCF_INS<<1,
+ del_mask = VCF_DEL<<1,
+ ref_mask = 1;
+
+// Can these types be merged given the -m settings? Despite the function's name, its focus is on
+// excluding incompatible records, there will be a finer matching later in stage_line()
+//
+// Parameters:
+//   args            - merge state; args->collapse, args->vcmp and args->maux are read here
+//   selected_types  - bitmask (ref_mask, snp_mask, ...) of the types of records selected so far, must be non-zero
+//   buf, irec       - the candidate record is buf->lines[irec], its type bitmask is buf->rec[irec].var_types
+// Returns 1 if the candidate may be merged with the current selection, 0 otherwise.
+static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
+{
+ int k;
+ maux_t *maux = args->maux;
+ bcf1_t *rec = buf->lines[irec];
+ int rec_types = buf->rec[irec].var_types;
+
+ assert( selected_types ); // this is trivially true, set in can_merge()
+
+ if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything
+
+ // REF and gVCF_REF with no other alleles present can be merged with anything
+ if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
+ if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
+
+ if ( args->collapse!=COLLAPSE_NONE )
+ {
+ // If we are here, one of the following modes must have been set: both,snps,indels,snp-ins-del
+ // Include the new record if
+ // - rec has SNV, we already have SNV, and -m is both,snps,snp-ins-del
+ // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
+ if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
+ {
+ if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
+ }
+ if ( args->collapse&COLLAPSE_INDELS )
+ {
+ if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
+ }
+ if ( args->collapse&COLLAPSE_SNP_INS_DEL )
+ {
+ // insertions only pair with insertions, deletions only with deletions
+ if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
+ if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
+ }
+ // Whatever is left, allow to match if the alleles match exactly
+ }
+
+ // The -m none mode or exact matching requested
+ // Simple test first: are the variants of the same type?
+ // Both masks are shifted right in lockstep until one of them becomes zero; if the other is
+ // still non-zero, their highest set bits are at different positions and the types are
+ // considered different. Note that only the highest set bit of each mask is compared this way.
+ int x = selected_types >> 1; // remove REF
+ int y = rec_types >> 1; // remove REF
+ while ( x && y ) { x>>=1; y>>=1; }
+ if ( x || y ) return 0; // the types differ
+
+ if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible
+ for (k=1; k<rec->n_allele; k++)
+ {
+ if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or <NON_REF>)
+ // stop at the first ALT that is already among the selected alleles (presumably vcmp_find_allele
+ // returns the index of the matching allele or a negative value when there is none)
+ if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
+ }
+ if ( k==rec->n_allele ) return 0; // none of the record's ALT alleles was found among the selected alleles
+ // NOTE(review): the loop above stops at the first match, so reaching this point means that at least
+ // one ALT allele of rec is shared with the records selected thus far, not necessarily all of them
+ return 1;
+}
+
+// Add the alleles of the record buf->lines[irec] from reader `ireader` to the merged allele
+// list maux->als. Alongside it updates the per-allele counts (maux->cnt), the per-allele type
+// masks (maux->als_types) and the record's map from its own allele indexes to indexes in
+// maux->als (buf->rec[irec].map). Calls error() if merge_alleles() fails.
+static void maux_update_alleles(args_t *args, int ireader, int irec)
+{
+ int k;
+ bcf_sr_t *reader = &args->files->readers[ireader];
+ maux_t *maux = args->maux;
+ buffer_t *buf = &maux->buf[ireader];
+ maux1_t *ma1 = &buf->rec[irec];
+ bcf1_t *line = buf->lines[irec];
+ // make sure the allele map has room for every allele of this record
+ hts_expand(int, line->n_allele, ma1->mmap, ma1->map);
+ if ( !maux->nals ) // first record to be merged, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ hts_expand0(int, maux->nals, maux->mals_types, maux->als_types);
+ for (k=0; k<maux->nals; k++)
+ {
+ free(maux->als[k]);
+ maux->als[k] = strdup(line->d.allele[k]);
+ ma1->map[k] = k; // identity mapping, the output alleles are a copy of this record's alleles
+ maux->cnt[k] = 1;
+ int var_type = bcf_has_variant_type(line, k, VCF_ANY);
+ // in the snp-ins-del mode the generic indel flag is dropped from the stored type
+ if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL;
+ // a zero type is stored as ref_mask, anything else is shifted by one bit to match the *_mask constants
+ maux->als_types[k] = var_type ? var_type<<1 : ref_mask;
+ }
+ return;
+ }
+ // normalize alleles
+ maux->als = merge_alleles(line->d.allele, line->n_allele, ma1->map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
+ // merge_alleles may have increased maux->nals, grow the per-allele arrays accordingly
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ hts_expand0(int, maux->nals, maux->mals_types, maux->als_types);
+ for (k=1; k<line->n_allele; k++)
+ {
+ int ik = ma1->map[k]; // index of this record's k-th allele in maux->als
+ int var_type = bcf_has_variant_type(line, k, VCF_ANY);
+ if ( args->collapse==COLLAPSE_SNP_INS_DEL ) var_type &= ~VCF_INDEL;
+ maux->als_types[ik] = var_type ? var_type<<1 : ref_mask;
+ maux->cnt[ik]++; // how many times an allele appears in the files
+ }
+ maux->cnt[0]++; // the REF allele is counted once per merged record
+}
+
/*
- Determine which line should be merged from which reader: go through all
- readers and all buffered lines, expand REF,ALT and try to match lines with
- the same ALTs.
+ Determine which lines remain to be merged across readers at the current position and
+ are compatible given the -m criteria. This is indicated by maux1_t.skip: 0=compatible,
+ SKIP_DONE=the record is done, SKIP_DIFF=not compatible and will be included next time.
+
+ At the same time count how many times each allele is present across the readers and records
+ so that we can prioritize the records with the same alleles to come first. In the end at most
+ one record at a time can be selected from each reader and that will be done in stage_line().
+
+ The function maux_reset already initialized structures for this position, so here each
+ reader comes with the beg,end indexes that point to records with the same maux_t.pos position.
*/
int can_merge(args_t *args)
{
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
- int i,j,k, ntodo = 0;
+ int i,j, ntodo = 0;
for (i=0; i<maux->nals; i++)
{
free(maux->als[i]);
maux->als[i] = NULL;
+ maux->cnt[i] = 0;
}
maux->var_types = maux->nals = 0;
- // this is only for the `-m none -g` mode, ensure that <*> lines come last
- #define VCF_GVCF_REF 1
-
+ // In this loop we do the following:
+ // - remember the first encountered ID if matching by ID
+ // - count the number of unprocessed records at this position
+ // - collect all variant types at this position. This is to be able to perform -m matching and
+ // print SNVs first, then indels, then gVCF blocks
+ // - init the 'skip' variable to SKIP_DIFF for each record that has not been used yet
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
+ buf->var_types = 0;
- if ( gaux && gaux[i].active )
+ if ( gaux && gaux[i].active ) // active gvcf block
{
- // skip readers with active gvcf blocks
buf->rec[buf->beg].skip = SKIP_DIFF;
+ maux->var_types |= ref_mask;
+ buf->var_types |= ref_mask;
+ buf->rec[buf->beg].var_types = ref_mask;
continue;
}
+
+ // for gvcf: find out REF at this position
+ if ( buf->beg < buf->end && ref=='N' ) ref = buf->lines[buf->beg]->d.allele[0][0];
+
for (j=buf->beg; j<buf->end; j++)
{
if ( buf->rec[j].skip & SKIP_DONE ) continue;
ntodo++;
bcf1_t *line = buf->lines[j];
- if ( args->merge_by_id )
- id = line->d.id;
- else
+ if ( args->merge_by_id && !id ) { id = line->d.id; continue; } // set ID when merging by id
+
+ if ( !buf->rec[j].var_types )
{
int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
- if (var_type < 0) error("bcf_has_variant_types() failed.");
+ if ( var_type < 0 ) error("bcf_has_variant_types() failed.");
if ( args->collapse==COLLAPSE_SNP_INS_DEL )
{
// need to distinguish between ins and del so strip the VCF_INDEL flag
var_type &= ~VCF_INDEL;
}
- maux->var_types |= var_type ? var_type<<2 : 2;
-
- // for the `-m none -g` mode
- if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
- maux->var_types |= VCF_GVCF_REF;
+ var_type = var_type ? var_type<<1 : ref_mask;
+ if ( args->do_gvcf && is_gvcf_block(line) ) var_type |= ref_mask;
+ buf->rec[j].var_types = var_type;
}
+ maux->var_types |= buf->rec[j].var_types;
+ buf->var_types |= buf->rec[j].var_types;
}
-
- // for gvcf: find out REF at this position
- if ( buf->beg < buf->end && ref=='N' )
- ref = buf->lines[buf->beg]->d.allele[0][0];
}
if ( !ntodo ) return 0;
+ int selected_types = 0;
+
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
// position and normalize relevant alleles.
// REF-only sites may be associated with both SNPs and indels.
for (i=0; i<files->nreaders; i++)
{
- bcf_sr_t *reader = &files->readers[i];
buffer_t *buf = &maux->buf[i];
-
if ( gaux && gaux[i].active )
{
+ // gVCF records inherited from an upstream gVCF block have incorrect or missing allele and position
gaux[i].line->d.allele[0][0] = ref;
gaux[i].line->pos = maux->pos;
+ maux_update_alleles(args, i, buf->beg);
+ selected_types |= ref_mask;
+ continue;
}
-
for (j=buf->beg; j<buf->end; j++)
{
if ( buf->rec[j].skip & SKIP_DONE ) continue;
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
-
- int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
- if (line_type < 0) error("bcf_has_variant_types() failed.");
- line_type = line_type ? line_type<<2 : 2;
+ int line_types = buf->rec[j].var_types;
// select relevant lines
if ( args->merge_by_id )
{
- if ( strcmp(id,line->d.id) ) continue;
+ if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record
}
+ else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
else
{
- // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
- // records come last, otherwise infinite loop is created (#1164)
- if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
- {
- if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
- }
- if ( args->collapse==COLLAPSE_NONE && maux->nals )
- {
- // All alleles of the tested record must be present in the
- // selected maux record plus variant types must be the same
- if ( (maux->var_types & line_type) != line_type ) continue;
- if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
- for (k=1; k<line->n_allele; k++)
- {
- if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
- }
- if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele
- }
- if ( !(args->collapse&COLLAPSE_ANY) )
- {
- // Merge:
- // - SNPs+SNPs+MNPs+REF if -m both,snps
- // - indels+indels+REF if -m both,indels, REF only if SNPs are not present
- // - SNPs come first
- if ( line_type & (indel_mask|ins_mask|del_mask) )
- {
- if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
- if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
- }
- }
+ // First time here, choosing the first line: prioritize SNPs when available in the -m snps,both modes
+ if ( (args->collapse&COLLAPSE_SNPS || args->collapse==COLLAPSE_NONE) // asked to merge SNVs into multiallelics
+ && (maux->var_types&snp_mask) // there are SNVs at the current position
+ && !(buf->rec[j].var_types&(snp_mask|ref_mask)) // and this record is not a SNV nor ref
+ ) continue;
}
- buf->rec[j].skip = 0;
+ selected_types |= line_types;
- hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
- if ( !maux->nals ) // first record, copy the alleles to the output
- {
- maux->nals = line->n_allele;
- hts_expand0(char*, maux->nals, maux->mals, maux->als);
- hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
- for (k=0; k<maux->nals; k++)
- {
- free(maux->als[k]);
- maux->als[k] = strdup(line->d.allele[k]);
- buf->rec[j].map[k] = k;
- maux->cnt[k] = 1;
- }
- continue;
- }
- // normalize alleles
- maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
- if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname);
- hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
- for (k=1; k<line->n_allele; k++)
- maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files
- maux->cnt[0]++;
+ buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line
+ maux_update_alleles(args, i, j);
}
}
return 1;
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
- // debug_maux(args);
-
- // take the most frequent allele present in multiple files, REF is skipped
- int i,j,k,icnt = 1;
- for (i=2; i<maux->nals; i++)
- if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+ // Take the most frequent allele present in multiple files, REF and gVCF_REF are skipped.
+ int i,j,k,icnt = -1;
+ for (i=1; i<maux->nals; i++)
+ {
+ if ( maux->als_types[i] & ref_mask ) continue;
+ if ( icnt==-1 || maux->cnt[icnt] < maux->cnt[i] ) icnt = i;
+ }
+ int selected_type = icnt>0 ? maux->als_types[icnt] : ref_mask;
int nout = 0;
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
buf->cur = -1;
- if ( buf->beg >= buf->end ) continue; // no lines in the buffer
+ if ( buf->beg >= buf->end ) continue; // No lines in the buffer at this site
// find lines with the same allele
for (j=buf->beg; j<buf->end; j++)
{
- if ( buf->rec[j].skip ) continue; // done or not compatible
- if ( args->merge_by_id ) break;
- if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record
+ if ( buf->rec[j].skip )
+ {
+ int is_gvcf = maux->gvcf && maux->gvcf[i].active ? 1 : 0;
+ if ( !is_gvcf && is_gvcf_block(buf->lines[j]) ) is_gvcf = 1;
+ if ( !is_gvcf ) continue; // done or not compatible
+ }
+ if ( args->merge_by_id ) break; // if merging by ID and the line is compatible, then this is THE line
+
+ // skip if the reader has a record that matches the most frequent allele and this record is not it
+ if ( (selected_type & buf->var_types) && !(selected_type & buf->rec[j].var_types) ) continue;
+ // if the reader does not have the most frequent allele type but is a ref, accept
+ if ( !(selected_type & buf->var_types) && (buf->rec[j].var_types & ref_mask) ) break;
+ if ( selected_type==ref_mask ) break;
+
+ // accept if the record has the most frequent allele
for (k=0; k<buf->lines[j]->n_allele; k++)
if ( icnt==buf->rec[j].map[k] ) break;
-
if ( k<buf->lines[j]->n_allele ) break;
}
if ( j>=buf->end )
{
// no matching allele found in this file
- if ( args->collapse==COLLAPSE_NONE ) continue;
+ if ( args->collapse==COLLAPSE_NONE ) continue; // exact matching requested, skip
+ // choose something compatible to create a multiallelic site given the -m criteria
for (j=buf->beg; j<buf->end; j++)
{
if ( buf->rec[j].skip ) continue; // done or not compatible
if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
- int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
- if (line_type < 0) error("bcf_has_variant_types() failed.");
- if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
- if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
- if ( line_type==VCF_REF )
+ int line_type = buf->rec[j].var_types;
+ if ( maux->var_types&snp_mask && line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ins_mask && line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( maux->var_types&del_mask && line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( line_type&ref_mask )
{
if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
{
// found a suitable line for merging
buf->cur = j;
-
- // mark as finished so that it's ignored next time
- buf->rec[j].skip = SKIP_DONE;
- nout++;
}
}
+
+ // debug_maux(args);
+
+ // Mark lines staged for merging as finished so that they are ignored next time
+ for (i=0; i<files->nreaders; i++)
+ {
+ buffer_t *buf = &maux->buf[i];
+ if ( buf->cur == -1 ) continue;
+
+ buf->rec[buf->cur].skip = SKIP_DONE;
+ nout++;
+ }
+
assert( nout );
}
error_errno("[%s] Failed to update header", __func__);
}
info_rules_init(args);
+ missing_rules_init(args);
bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header));
if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
return;
}
+ else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
args->maux = maux_init(args);
gvcf_flush(args,1);
info_rules_destroy(args);
+ missing_rules_destroy(args);
maux_destroy(args->maux);
bcf_hdr_destroy(args->out_hdr);
- if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"bcftools_stdout");
bcf_destroy1(args->out_line);
kh_destroy(strdict, args->tmph);
if ( args->tmps.m ) free(args->tmps.s);
fprintf(bcftools_stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n");
fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(bcftools_stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
- fprintf(bcftools_stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+ fprintf(bcftools_stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max -M PL:max,AD:0\n");
fprintf(bcftools_stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(bcftools_stderr, " -l, --file-list FILE Read file names from the file\n");
fprintf(bcftools_stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
+ fprintf(bcftools_stderr, " -M, --missing-rules TAG:METHOD Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
fprintf(bcftools_stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
{"regions-file",required_argument,NULL,'R'},
{"regions-overlap",required_argument,NULL,4},
{"info-rules",required_argument,NULL,'i'},
+ {"missing-rules",required_argument,NULL,'M'},
{"no-version",no_argument,NULL,8},
{"no-index",no_argument,NULL,10},
{"filter-logic",required_argument,NULL,'F'},
+ {"write-index",no_argument,NULL,11},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:M:l:g:F:0L:",loptions,NULL)) >= 0) {
switch (c) {
case 'L':
args->local_alleles = strtol(optarg,&tmp,10);
break;
case 'l': args->file_list = optarg; break;
case 'i': args->info_rules = optarg; break;
+ case 'M': args->missing_rules_str = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
- else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
+ else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL|COLLAPSE_SNPS;
else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
else error("The -m type \"%s\" is not recognised.\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->no_index = 1; break;
+ case 11 : args->write_index = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "bcftools.h"
#include "rbuf.h"
#include "abuf.h"
+#include "gff.h"
+#include "regidx.h"
#define CHECK_REF_EXIT 1
#define CHECK_REF_WARN 2
int32_t *int32_arr;
int ntmp_arr1, ntmp_arr2, nint32_arr;
kstring_t *tmp_str;
- kstring_t *tmp_als, tmp_kstr;
- int ntmp_als;
+ kstring_t *tmp_als, *tmp_del, tmp_kstr;
+ int ntmp_als, ntmp_del;
rbuf_t rbuf;
int buf_win; // maximum distance between two records to consider
int aln_win; // the realignment window size (maximum repeat size)
int use_star_allele, ma_use_ref_allele;
char *old_rec_tag;
htsFile *out;
+ char *index_fn;
+ int write_index;
+ int right_align;
+ char *gff_fname;
+ gff_t *gff;
+ regidx_t *idx_tscript;
+ regitr_t *itr_tscript;
}
args_t;
error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
}
+// Decide the direction in which the record `line` should be aligned.
+// Returns 1 to left-align and 0 to right-align:
+//   - args->right_align set: always right-align
+//   - no GFF loaded (args->gff is NULL): always left-align
+//   - otherwise look up transcripts overlapping the record in args->idx_tscript and right-align
+//     only if at least one forward-strand transcript and no reverse-strand transcript overlaps
+static int is_left_align(args_t *args, bcf1_t *line)
+{
+ if ( args->right_align ) return 0;
+ if ( !args->gff ) return 1;
+ const char *chr = bcf_seqname(args->hdr,line);
+ if ( !strncasecmp("chr",chr,3) ) chr += 3; // strip 'chr' prefix, that's what we requested the GFF reader to do
+ // no overlapping transcript at all, use the default left-alignment
+ if ( !regidx_overlap(args->idx_tscript,chr,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;
+
+ // if there are two conflicting overlapping transcripts, go with the default left-alignment
+ int has_fwd = 0;
+ while ( regitr_overlap(args->itr_tscript) )
+ {
+ gf_tscript_t *tr = regitr_payload(args->itr_tscript, gf_tscript_t*);
+ if ( tr->strand==STRAND_FWD ) has_fwd = 1;
+ if ( tr->strand==STRAND_REV ) return 1; // any reverse-strand transcript means left-align
+ }
+ // either no hit at all (then left-align) or everything was on fwd strand (then right-align)
+ return has_fwd ? 0 : 1;
+}
+// Left-align the alleles held in args->tmp_als (one kstring per allele of `line`), modifying them
+// in place. Identical trailing bases are trimmed from the right; whenever an allele becomes empty,
+// all alleles are extended to the left with reference sequence fetched from args->fai. Redundant
+// leading bases are removed at the end.
+// Returns the new 0-based position of the record; line->pos itself is not modified here.
+// Calls error() if the reference sequence cannot be fetched.
+static hts_pos_t realign_left(args_t *args, bcf1_t *line)
+{
+ // trim from right
+ char *ref = NULL;
+ int i;
+ hts_pos_t nref=0, new_pos = line->pos;
+ kstring_t *als = args->tmp_als;
+ while (1)
+ {
+ // is the rightmost base identical in all alleles?
+ int min_len = als[0].l;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+ if ( als[i].l < min_len ) min_len = als[i].l;
+ }
+ if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+ if ( min_len<=1 && new_pos==0 ) break; // at the chromosome start there is nothing to pad with, stop
+
+ int pad_from_left = 0;
+ for (i=0; i<line->n_allele; i++) // trim all alleles
+ {
+ als[i].l--;
+ if ( !als[i].l ) pad_from_left = 1;
+ }
+ if ( pad_from_left )
+ {
+ // extend all alleles to the left by aln_win bases (unless close to the chr start).
+ // Extra bases will be trimmed from the left after this loop is done
+ int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
+ free(ref);
+ ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), new_pos-npad, new_pos-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) new_pos-npad+1);
+ replace_iupac_codes(ref,nref);
+ for (i=0; i<line->n_allele; i++)
+ {
+ // make room, shift the existing bases to the right and put the padding in front
+ ks_resize(&als[i], als[i].l + npad);
+ if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
+ memcpy(als[i].s,ref,npad);
+ als[i].l += npad;
+ }
+ new_pos -= npad;
+ }
+ }
+ free(ref);
+
+ // trim from left
+ int ntrim_left = 0;
+ while (1)
+ {
+ // is the first base identical in all alleles?
+ int min_len = als[0].l - ntrim_left;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+ if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+ }
+ // stop when the first bases differ or when the shortest allele would drop below one base
+ if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
+ ntrim_left++;
+ }
+ if ( ntrim_left )
+ {
+ // drop the shared prefix from every allele and move the position accordingly
+ for (i=0; i<line->n_allele; i++)
+ {
+ memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
+ als[i].l -= ntrim_left;
+ }
+ new_pos += ntrim_left;
+ }
+ return new_pos;
+}
+
+// Right-align the alleles held in args->tmp_als (one kstring per allele of `line`), modifying them
+// in place. Identical leading bases are consumed from the left; whenever the shortest allele runs
+// out, all alleles are extended to the right with reference sequence fetched from args->fai.
+// Redundant trailing bases are removed at the end.
+// Returns the new 0-based position of the record; line->pos itself is not modified here.
+// Calls error() if the reference sequence cannot be fetched.
+static hts_pos_t realign_right(args_t *args, bcf1_t *line)
+{
+ char *ref = NULL;
+ int i;
+ hts_pos_t new_pos = line->pos, nref = 0;
+ kstring_t *als = args->tmp_als;
+
+ // trim from left
+ // npad_right is the offset from line->pos of the first reference base not yet appended to the alleles
+ int ntrim_left = 0, npad_right = line->rlen, has_indel = 0;
+ while (1)
+ {
+ // is the leftmost base identical in all alleles?
+ int min_len = als[0].l - ntrim_left;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( als[0].l!=als[i].l ) has_indel = 1;
+ if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+ if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+ }
+ if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further
+
+ ntrim_left++;
+ if ( min_len<=1 ) // pad from the right
+ {
+ free(ref);
+ ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref);
+ if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left);
+ // The fetched interval is inclusive on both ends, so up to aln_win+1 bases are returned. Advance
+ // by the number of bases actually fetched: advancing by aln_win would make the next fetch start
+ // on the last base of this one and append that base twice.
+ npad_right += nref;
+ replace_iupac_codes(ref,nref);
+ for (i=0; i<line->n_allele; i++) kputs(ref, &als[i]);
+ }
+ }
+ // when the allele lengths differ, trim one base less so that one shared leading base is kept
+ ntrim_left -= has_indel;
+ if ( ntrim_left > 0 )
+ {
+ for (i=0; i<line->n_allele; i++)
+ {
+ memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left);
+ als[i].l -= ntrim_left;
+ }
+ new_pos += ntrim_left;
+ }
+ free(ref);
+
+ // trim from right
+ while (1)
+ {
+ // is the last base identical in all alleles?
+ int min_len = als[0].l;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+ if ( min_len > als[i].l ) min_len = als[i].l;
+ }
+ if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more
+ for (i=0; i<line->n_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; }
+ }
+ return new_pos;
+}
+
#define ERR_DUP_ALLELE -2
#define ERR_REF_MISMATCH -1
#define ERR_OK 0
// make a copy of each allele for trimming
hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+ hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
kstring_t *als = args->tmp_als;
+ kstring_t *del = args->tmp_del;
for (i=0; i<line->n_allele; i++)
{
- if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+ del[i].l = 0;
+ if ( line->d.allele[i][0]=='<' )
+ {
+ // symbolic allele, only <DEL.*> will be realigned
+ if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
+ if ( nref < line->rlen )
+ {
+ free(ref);
+ reflen = line->rlen;
+ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+ seq_to_upper(ref,0);
+ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
+ als[0].l = 0;
+ kputs(ref, &als[0]);
+ als[i].l = 0;
+ kputsn(ref,1,&als[i]);
+ kputs(line->d.allele[i],&del[i]);
+ continue;
+ }
+ }
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
{
if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
}
-
- // trim from right
- int new_pos = line->pos;
- while (1)
- {
- // is the rightmost base identical in all alleles?
- int min_len = als[0].l;
- for (i=1; i<line->n_allele; i++)
- {
- if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break;
- if ( als[i].l < min_len ) min_len = als[i].l;
- }
- if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
- if ( min_len<=1 && new_pos==0 ) break;
-
- int pad_from_left = 0;
- for (i=0; i<line->n_allele; i++) // trim all alleles
- {
- als[i].l--;
- if ( !als[i].l ) pad_from_left = 1;
- }
- if ( pad_from_left )
- {
- int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
- free(ref);
- ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
- if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
- replace_iupac_codes(ref,nref);
- for (i=0; i<line->n_allele; i++)
- {
- ks_resize(&als[i], als[i].l + npad);
- if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
- memcpy(als[i].s,ref,npad);
- als[i].l += npad;
- }
- new_pos -= npad;
- }
- }
free(ref);
+ ref = NULL;
- // trim from left
- int ntrim_left = 0;
- while (1)
- {
- // is the first base identical in all alleles?
- int min_len = als[0].l - ntrim_left;
- for (i=1; i<line->n_allele; i++)
- {
- if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
- if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
- }
- if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
- ntrim_left++;
- }
- if ( ntrim_left )
- {
- for (i=0; i<line->n_allele; i++)
- {
- memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
- als[i].l -= ntrim_left;
- }
- new_pos += ntrim_left;
- }
+ // which direction are we aligning?
+ int left_align = is_left_align(args, line);
+
+ hts_pos_t new_pos;
+ if ( left_align )
+ new_pos = realign_left(args, line);
+ else
+ new_pos = realign_right(args, line);
// Have the alleles changed?
als[0].s[ als[0].l ] = 0; // in order for strcmp to work
for (i=0; i<line->n_allele; i++)
{
if (i>0) kputc(',',&args->tmp_kstr);
- kputsn(als[i].s,als[i].l,&args->tmp_kstr);
+ if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+ else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
}
args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
ngts2 /= nsmpl;
if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
- int32_t *gt = (int32_t*) args->tmp_arr1;
- int32_t *gt2 = (int32_t*) args->tmp_arr2;
+ int32_t *gt = (int32_t*) args->tmp_arr1; // the first, destination line
+ int32_t *gt2 = (int32_t*) args->tmp_arr2; // one of the subsequent lines, i.e. the source line
for (j=0; j<nsmpl; j++)
{
+ // Take each source allele and apply to the first line. We try to preserve the order and phasing and we
+ // never overwrite with ref allele
for (k2=0; k2<ngts2; k2++)
{
if ( gt2[k2]==bcf_int32_vector_end ) break;
int ial2 = bcf_gt_allele(gt2[k2]);
if ( ial2==0 ) continue; // never overwrite with ref
if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+
+ // The destination allele
int ial = args->maps[i].map[ial2];
- for (k=0; k<ngts; k++)
- if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
- if ( k<ngts )
+ if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
+ gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+ else
{
- gt[k] = bcf_gt_unphased(ial);
+ // conflict, the first line has non-zero allele, use the old way, possibly disrupt the phasing
+ for (k=0; k<ngts; k++)
+ if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+ if ( k<ngts )
+ gt[k] = bcf_gt_unphased(ial);
}
}
gt += ngts;
abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
}
+ if ( args->gff_fname )
+ {
+ args->gff = gff_init(args->gff_fname);
+ gff_set(args->gff,verbosity,1);
+ gff_set(args->gff,strip_chr_names,1);
+ gff_parse(args->gff);
+ args->idx_tscript = gff_get(args->gff,idx_tscript);
+ args->itr_tscript = regitr_init(NULL);
+ }
}
static void destroy_data(args_t *args)
{
+ if ( args->gff )
+ {
+ gff_destroy(args->gff);
+ regitr_destroy(args->itr_tscript);
+ }
cmpals_destroy(&args->cmpals_in);
cmpals_destroy(&args->cmpals_out);
int i;
free(args->maps[i].map);
for (i=0; i<args->ntmp_als; i++)
free(args->tmp_als[i].s);
+ for (i=0; i<args->ntmp_del; i++)
+ free(args->tmp_del[i].s);
free(args->tmp_als);
+ free(args->tmp_del);
free(args->tmp_kstr.s);
if ( args->tmp_str )
{
hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
bcf1_t *line;
int prev_rid = -1, prev_pos = -1, prev_type = 0;
if ( j>0 ) flush_buffer(args, args->out, j);
}
flush_buffer(args, args->out, args->rbuf.n);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out)<0 )
+ {
+ if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n");
fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n");
fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+ fprintf(stderr, " -g, --gff-annot FILE Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # normalize and left-align indels\n");
{"old-rec-tag",required_argument,NULL,12},
{"keep-sum",required_argument,NULL,10},
{"fasta-ref",required_argument,NULL,'f'},
+ {"gff-annot",required_argument,NULL,'g'},
+ {"right-align",no_argument,NULL,15}, // undocumented, only for debugging
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
{"multi-overlaps",required_argument,NULL,13},
{"check-ref",required_argument,NULL,'c'},
{"strict-filter",no_argument,NULL,'s'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,14},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
switch (c) {
case 10:
// possibly generalize this also to INFO/AD and other tags
error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data
break;
+ case 'g': args->gff_fname = optarg; break;
case 'a': args->atomize = SPLIT; break;
case 11 :
if ( optarg[0]=='*' ) args->use_star_allele = 1;
else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
else error("Invalid argument to --multi-overlaps\n");
break;
+ case 14 : args->write_index = 1; break;
+ case 15 : args->right_align = 1; break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "bcftools.h"
#include "rbuf.h"
#include "abuf.h"
+#include "gff.h"
+#include "regidx.h"
#define CHECK_REF_EXIT 1
#define CHECK_REF_WARN 2
int32_t *int32_arr;
int ntmp_arr1, ntmp_arr2, nint32_arr;
kstring_t *tmp_str;
- kstring_t *tmp_als, tmp_kstr;
- int ntmp_als;
+ kstring_t *tmp_als, *tmp_del, tmp_kstr;
+ int ntmp_als, ntmp_del;
rbuf_t rbuf;
int buf_win; // maximum distance between two records to consider
int aln_win; // the realignment window size (maximum repeat size)
int use_star_allele, ma_use_ref_allele;
char *old_rec_tag;
htsFile *out;
+ char *index_fn;
+ int write_index;
+ int right_align;
+ char *gff_fname;
+ gff_t *gff;
+ regidx_t *idx_tscript;
+ regitr_t *itr_tscript;
}
args_t;
error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
}
+// Decide the normalization direction for one record.
+// Returns 1 when the record should be left-aligned and 0 when it should be right-aligned.
+static int is_left_align(args_t *args, bcf1_t *line)
+{
+    // the --right-align switch forces right-alignment regardless of any annotation
+    if ( args->right_align ) return 0;
+
+    // no GFF loaded, nothing to consult: left-align
+    if ( !args->gff ) return 1;
+
+    // the GFF reader was asked to strip the 'chr' prefix, so the query name must be stripped as well
+    const char *seqname = bcf_seqname(args->hdr,line);
+    if ( strncasecmp("chr",seqname,3)==0 ) seqname += 3;
+
+    // no transcript overlaps the record: left-align
+    if ( !regidx_overlap(args->idx_tscript,seqname,line->pos,line->pos+line->rlen, args->itr_tscript) ) return 1;
+
+    // Walk through the overlapping transcripts. A single transcript on the reverse strand is enough
+    // to fall back to the default left-alignment, even if others are on the forward strand
+    int seen_fwd = 0;
+    while ( regitr_overlap(args->itr_tscript) )
+    {
+        gf_tscript_t *tscript = regitr_payload(args->itr_tscript, gf_tscript_t*);
+        if ( tscript->strand==STRAND_REV ) return 1;
+        if ( tscript->strand==STRAND_FWD ) seen_fwd = 1;
+    }
+
+    // right-align only if at least one forward-strand transcript was seen (and none on the reverse strand)
+    if ( seen_fwd ) return 0;
+    return 1;
+}
+// Left-align the working allele copies held in args->tmp_als.
+// The alleles are modified in place; the return value is the new 0-based position of the record.
+static hts_pos_t realign_left(args_t *args, bcf1_t *line)
+{
+    kstring_t *als = args->tmp_als;
+    const int nals = line->n_allele;
+    hts_pos_t pos = line->pos, flank_len = 0;
+    char *flank = NULL;
+    int ial;
+
+    // Step 1: remove the shared suffix, one base at a time
+    for (;;)
+    {
+        // do all alleles end with the same base (case-insensitive)?
+        int shortest = als[0].l;
+        for (ial=1; ial<nals; ial++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[ial].s[ als[ial].l-1 ]) ) break;
+            if ( als[ial].l < shortest ) shortest = als[ial].l;
+        }
+        if ( ial!=nals ) break;                 // the last bases differ, nothing more to trim
+        if ( shortest<=1 && pos==0 ) break;     // at the very start of the sequence, no room to pad
+
+        // drop the last base of every allele and remember if any of them became empty
+        int emptied = 0;
+        for (ial=0; ial<nals; ial++)
+            if ( --als[ial].l == 0 ) emptied = 1;
+        if ( !emptied ) continue;
+
+        // An allele ran out of bases: prepend up to aln_win reference bases to all alleles
+        // (fewer when close to the start of the sequence). Surplus bases are removed in step 2
+        int npad = args->aln_win;
+        if ( pos < npad ) npad = pos;
+        free(flank);
+        flank = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), pos-npad, pos-1, &flank_len);
+        if ( !flank ) error("faidx_fetch_seq64 failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line), (int64_t) pos-npad+1);
+        replace_iupac_codes(flank,flank_len);
+        for (ial=0; ial<nals; ial++)
+        {
+            kstring_t *al = &als[ial];
+            ks_resize(al, al->l + npad);
+            if ( al->l ) memmove(al->s+npad,al->s,al->l);
+            memcpy(al->s,flank,npad);
+            al->l += npad;
+        }
+        pos -= npad;
+    }
+    free(flank);
+
+    // Step 2: count the shared leading bases, always leaving at least one base in the shortest allele
+    int nskip = 0;
+    for (;;)
+    {
+        int shortest = als[0].l - nskip;
+        for (ial=1; ial<nals; ial++)
+        {
+            if ( toupper(als[0].s[nskip]) != toupper(als[ial].s[nskip]) ) break;
+            if ( shortest > als[ial].l - nskip ) shortest = als[ial].l - nskip;
+        }
+        if ( ial!=nals || shortest<=1 ) break;  // the bases differ or an allele would become empty
+        nskip++;
+    }
+
+    // Step 3: shift the alleles and the position by the number of leading bases removed
+    if ( nskip )
+    {
+        for (ial=0; ial<nals; ial++)
+        {
+            memmove(als[ial].s,als[ial].s+nskip,als[ial].l-nskip);
+            als[ial].l -= nskip;
+        }
+        pos += nskip;
+    }
+    return pos;
+}
+
+// Right-align the working allele copies held in args->tmp_als.
+// The alleles are modified in place; the return value is the new 0-based position of the record.
+// Shared leading bases are consumed from the left while reference sequence is appended on the
+// right in chunks, then the surplus shared trailing bases are trimmed again.
+static hts_pos_t realign_right(args_t *args, bcf1_t *line)
+{
+    char *ref = NULL;
+    int i;
+    hts_pos_t new_pos = line->pos, nref = 0;
+    kstring_t *als = args->tmp_als;
+
+    // trim from left; npad_right is the offset from line->pos of the first reference base
+    // not yet present in the alleles
+    int ntrim_left = 0, npad_right = line->rlen, has_indel = 0;
+    while (1)
+    {
+        // is the leftmost base identical in all alleles?
+        int min_len = als[0].l - ntrim_left;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( als[0].l!=als[i].l ) has_indel = 1;
+            if ( toupper(als[0].s[ntrim_left]) != toupper(als[i].s[ntrim_left]) ) break;
+            if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
+        }
+        if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed further
+
+        ntrim_left++;
+        if ( min_len<=1 )   // pad from the right
+        {
+            free(ref);
+            ref = faidx_fetch_seq64(args->fai, bcf_seqname(args->hdr,line), line->pos + npad_right, line->pos + npad_right + args->aln_win, &nref);
+            if ( !ref ) error("faidx_fetch_seq64 failed at %s:%"PRIhts_pos"\n",bcf_seqname(args->hdr,line), new_pos + ntrim_left);
+
+            // The end coordinate of the fetch is inclusive, so aln_win+1 bases were requested.
+            // Advance by the number of bases actually returned, otherwise the next chunk would
+            // start on the last base of this one and that base would be appended twice
+            npad_right += nref;
+            replace_iupac_codes(ref,nref);
+            for (i=0; i<line->n_allele; i++) kputs(ref, &als[i]);
+        }
+    }
+
+    // when the alleles differ in length, leave one shared leading base in place
+    ntrim_left -= has_indel;
+    if ( ntrim_left > 0 )
+    {
+        for (i=0; i<line->n_allele; i++)
+        {
+            memmove(als[i].s, als[i].s + ntrim_left, als[i].l - ntrim_left);
+            als[i].l -= ntrim_left;
+        }
+        new_pos += ntrim_left;
+    }
+    free(ref);
+
+    // trim from right, this removes the unused part of the padding added above
+    while (1)
+    {
+        // is the last base identical in all alleles?
+        int min_len = als[0].l;
+        for (i=1; i<line->n_allele; i++)
+        {
+            if ( toupper(als[0].s[ als[0].l-1 ]) != toupper(als[i].s[ als[i].l-1 ]) ) break;
+            if ( min_len > als[i].l ) min_len = als[i].l;
+        }
+        if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed more
+        for (i=0; i<line->n_allele; i++) { als[i].l--; als[i].s[als[i].l]=0; }
+    }
+    return new_pos;
+}
+
#define ERR_DUP_ALLELE -2
#define ERR_REF_MISMATCH -1
#define ERR_OK 0
// make a copy of each allele for trimming
hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
+ hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
kstring_t *als = args->tmp_als;
+ kstring_t *del = args->tmp_del;
for (i=0; i<line->n_allele; i++)
{
- if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+ del[i].l = 0;
+ if ( line->d.allele[i][0]=='<' )
+ {
+ // symbolic allele, only <DEL.*> will be realigned
+ if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
+ if ( nref < line->rlen )
+ {
+ free(ref);
+ reflen = line->rlen;
+ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
+ seq_to_upper(ref,0);
+ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
+ als[0].l = 0;
+ kputs(ref, &als[0]);
+ als[i].l = 0;
+ kputsn(ref,1,&als[i]);
+ kputs(line->d.allele[i],&del[i]);
+ continue;
+ }
+ }
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
{
if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
}
-
- // trim from right
- int new_pos = line->pos;
- while (1)
- {
- // is the rightmost base identical in all alleles?
- int min_len = als[0].l;
- for (i=1; i<line->n_allele; i++)
- {
- if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break;
- if ( als[i].l < min_len ) min_len = als[i].l;
- }
- if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
- if ( min_len<=1 && new_pos==0 ) break;
-
- int pad_from_left = 0;
- for (i=0; i<line->n_allele; i++) // trim all alleles
- {
- als[i].l--;
- if ( !als[i].l ) pad_from_left = 1;
- }
- if ( pad_from_left )
- {
- int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
- free(ref);
- ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
- if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
- replace_iupac_codes(ref,nref);
- for (i=0; i<line->n_allele; i++)
- {
- ks_resize(&als[i], als[i].l + npad);
- if ( als[i].l ) memmove(als[i].s+npad,als[i].s,als[i].l);
- memcpy(als[i].s,ref,npad);
- als[i].l += npad;
- }
- new_pos -= npad;
- }
- }
free(ref);
+ ref = NULL;
- // trim from left
- int ntrim_left = 0;
- while (1)
- {
- // is the first base identical in all alleles?
- int min_len = als[0].l - ntrim_left;
- for (i=1; i<line->n_allele; i++)
- {
- if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
- if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
- }
- if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
- ntrim_left++;
- }
- if ( ntrim_left )
- {
- for (i=0; i<line->n_allele; i++)
- {
- memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
- als[i].l -= ntrim_left;
- }
- new_pos += ntrim_left;
- }
+ // which direction are we aligning?
+ int left_align = is_left_align(args, line);
+
+ hts_pos_t new_pos;
+ if ( left_align )
+ new_pos = realign_left(args, line);
+ else
+ new_pos = realign_right(args, line);
// Have the alleles changed?
als[0].s[ als[0].l ] = 0; // in order for strcmp to work
for (i=0; i<line->n_allele; i++)
{
if (i>0) kputc(',',&args->tmp_kstr);
- kputsn(als[i].s,als[i].l,&args->tmp_kstr);
+ if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
+ else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
}
args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
ngts2 /= nsmpl;
if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1);
- int32_t *gt = (int32_t*) args->tmp_arr1;
- int32_t *gt2 = (int32_t*) args->tmp_arr2;
+ int32_t *gt = (int32_t*) args->tmp_arr1; // the first, destination line
+ int32_t *gt2 = (int32_t*) args->tmp_arr2; // one of the subsequent lines, i.e. the source line
for (j=0; j<nsmpl; j++)
{
+ // Take each source allele and apply to the first line. We try to preserve the order and phasing and we
+ // never overwrite with ref allele
for (k2=0; k2<ngts2; k2++)
{
if ( gt2[k2]==bcf_int32_vector_end ) break;
int ial2 = bcf_gt_allele(gt2[k2]);
if ( ial2==0 ) continue; // never overwrite with ref
if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+
+ // The destination allele
int ial = args->maps[i].map[ial2];
- for (k=0; k<ngts; k++)
- if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
- if ( k<ngts )
+ if ( gt[k2]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k2]) || !bcf_gt_allele(gt[k2]) )
+ gt[k2] = bcf_gt_is_phased(gt[k2]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+ else
{
- gt[k] = bcf_gt_unphased(ial);
+ // conflict, the first line has non-zero allele, use the old way, possibly disrupt the phasing
+ for (k=0; k<ngts; k++)
+ if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+ if ( k<ngts )
+ gt[k] = bcf_gt_unphased(ial);
}
}
gt += ngts;
abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
}
+ if ( args->gff_fname )
+ {
+ args->gff = gff_init(args->gff_fname);
+ gff_set(args->gff,verbosity,1);
+ gff_set(args->gff,strip_chr_names,1);
+ gff_parse(args->gff);
+ args->idx_tscript = gff_get(args->gff,idx_tscript);
+ args->itr_tscript = regitr_init(NULL);
+ }
}
static void destroy_data(args_t *args)
{
+ if ( args->gff )
+ {
+ gff_destroy(args->gff);
+ regitr_destroy(args->itr_tscript);
+ }
cmpals_destroy(&args->cmpals_in);
cmpals_destroy(&args->cmpals_out);
int i;
free(args->maps[i].map);
for (i=0; i<args->ntmp_als; i++)
free(args->tmp_als[i].s);
+ for (i=0; i<args->ntmp_del; i++)
+ free(args->tmp_del[i].s);
free(args->tmp_als);
+ free(args->tmp_del);
free(args->tmp_kstr.s);
if ( args->tmp_str )
{
hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
bcf1_t *line;
int prev_rid = -1, prev_pos = -1, prev_type = 0;
if ( j>0 ) flush_buffer(args, args->out, j);
}
flush_buffer(args, args->out, args->rbuf.n);
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out)<0 )
+ {
+ if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
fprintf(bcftools_stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n");
fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence\n");
fprintf(bcftools_stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+ fprintf(bcftools_stderr, " -g, --gff-annot FILE Follow HGVS 3'rule and right-align variants in transcripts on the forward strand\n");
fprintf(bcftools_stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
fprintf(bcftools_stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(bcftools_stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # normalize and left-align indels\n");
{"old-rec-tag",required_argument,NULL,12},
{"keep-sum",required_argument,NULL,10},
{"fasta-ref",required_argument,NULL,'f'},
+ {"gff-annot",required_argument,NULL,'g'},
+ {"right-align",no_argument,NULL,15}, // undocumented, only for debugging
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
{"multi-overlaps",required_argument,NULL,13},
{"check-ref",required_argument,NULL,'c'},
{"strict-filter",no_argument,NULL,'s'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,14},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNag:",loptions,NULL)) >= 0) {
switch (c) {
case 10:
// possibly generalize this also to INFO/AD and other tags
error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data
break;
+ case 'g': args->gff_fname = optarg; break;
case 'a': args->atomize = SPLIT; break;
case 11 :
if ( optarg[0]=='*' ) args->use_star_allele = 1;
else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
else error("Invalid argument to --multi-overlaps\n");
break;
+ case 14 : args->write_index = 1; break;
+ case 15 : args->right_align = 1; break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *regions_list, *targets_list;
int argc, drop_header, verbose, record_cmd_line, plist_only;
+ char *index_fn;
+ int write_index;
}
args_t;
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
}
}
if ( args->filter )
filter_destroy(args->filter);
- if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ if (args->out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ }
}
static void usage(args_t *args)
fprintf(stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n");
fprintf(stderr, " -V, --version Print version string and exit\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
}
if ( argv[1][0]!='-' )
{
args->verbose = is_verbose(argc, argv);
- plugin_name = argv[1];
- argc--;
- argv++;
+ plugin_name = argv[1];
+ argc--;
+ argv++;
load_plugin(args, plugin_name, 1, &args->plugin);
if ( args->plugin.run )
{
{"targets-file",required_argument,NULL,'T'},
{"targets-overlap",required_argument,NULL,2},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->write_index = 1; break;
case '?':
case 'h': usage_only = 1; break;
default: error("Unknown argument: %s\n", optarg);
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **argv, *output_fname, *regions_list, *targets_list;
int argc, drop_header, verbose, record_cmd_line, plist_only;
+ char *index_fn;
+ int write_index;
}
args_t;
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
}
}
if ( args->filter )
filter_destroy(args->filter);
- if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ if (args->out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ }
}
static void usage(args_t *args)
fprintf(bcftools_stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
fprintf(bcftools_stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n");
fprintf(bcftools_stderr, " -V, --version Print version string and exit\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
if ( argv[1][0]!='-' )
{
args->verbose = is_verbose(argc, argv);
- plugin_name = argv[1];
- argc--;
- argv++;
+ plugin_name = argv[1];
+ argc--;
+ argv++;
load_plugin(args, plugin_name, 1, &args->plugin);
if ( args->plugin.run )
{
{"targets-file",required_argument,NULL,'T'},
{"targets-overlap",required_argument,NULL,2},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->write_index = 1; break;
case '?':
case 'h': usage_only = 1; break;
default: error("Unknown argument: %s\n", optarg);
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
smpl_ilist_destroy(ilist);
}
args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+ convert_set_option(args->convert, force_newline, 1);
convert_set_option(args->convert, subset_samples, &args->smpl_pass);
if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
free(samples);
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
smpl_ilist_destroy(ilist);
}
args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+ convert_set_option(args->convert, force_newline, 1);
convert_set_option(args->convert, subset_samples, &args->smpl_pass);
if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
free(samples);
/* vcfsort.c -- sort subcommand
- Copyright (C) 2017-2022 Genome Research Ltd.
+ Copyright (C) 2017-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
uint8_t *mem_block;
size_t nbuf, mbuf, nblk;
blk_t *blk;
+ char *index_fn;
+ int write_index;
}
args_t;
set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
while ( bhp->ndat )
{
blk_t *blk = bhp->dat[0];
khp_delete(blk, bhp);
blk_read(args, bhp, args->hdr, blk);
}
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out)<0 )
+ {
+ if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
clean_files(args);
#else
fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
#endif
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
}
{"output-file",required_argument,NULL,'o'},
{"output",required_argument,NULL,'o'},
{"help",no_argument,NULL,'h'},
+ {"write-index",no_argument,NULL,1},
{0,0,0,0}
};
char *tmp;
if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
+ case 1 : args->write_index = 1; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
/* vcfsort.c -- sort subcommand
- Copyright (C) 2017-2022 Genome Research Ltd.
+ Copyright (C) 2017-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
uint8_t *mem_block;
size_t nbuf, mbuf, nblk;
blk_t *blk;
+ char *index_fn;
+ int write_index;
}
args_t;
set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
while ( bhp->ndat )
{
blk_t *blk = bhp->dat[0];
khp_delete(blk, bhp);
blk_read(args, bhp, args->hdr, blk);
}
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(out)<0 )
+ {
+ if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname);
clean_files(args);
#else
fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
#endif
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
{"output-file",required_argument,NULL,'o'},
{"output",required_argument,NULL,'o'},
{"help",no_argument,NULL,'h'},
+ {"write-index",no_argument,NULL,1},
{0,0,0,0}
};
char *tmp;
if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
+ case 1 : args->write_index = 1; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
idist_t;
+// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins
+typedef struct
+{
+ int snv[21], indel[21];
+}
+vaf_t;
+
typedef struct
{
uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs;
- int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+ int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+ vaf_t vaf, *smpl_vaf; // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions
unsigned long int *smpl_dp;
idist_t dp, dp_sites;
int nusr;
gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
bin_t *af_bins;
float *farr;
- int mfarr;
+ int32_t *iarr;
+ int mfarr, miarr;
+ int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref;
// indel context
indel_ctx_t *indel_ctx;
if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
error("No such INFO tag: %s\n", args->af_tag);
+ int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0;
+
#if QUAL_STATS
args->m_qual = 999;
#endif
stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int));
+ if ( has_fmt_ad )
+ stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t));
#if HWE_STATS
stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
#endif
free(stats->smpl_dp);
free(stats->smpl_ndp);
free(stats->smpl_sngl);
+ free(stats->smpl_vaf);
idist_destroy(&stats->dp);
idist_destroy(&stats->dp_sites);
for (j=0; j<stats->nusr; j++)
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
if ( args->af_bins ) bin_destroy(args->af_bins);
free(args->farr);
+ free(args->iarr);
free(args->usr);
free(args->tmp_frm);
free(args->tmp_iaf);
if (args->filter[1]) filter_destroy(args->filter[1]);
}
+// The array tmp_iaf keeps the index of the AF bin for each allele, the first bin is for singletons.
+// The number of bins is either m_af (101) or as given by the user in --af-bins
static void init_iaf(args_t *args, bcf_sr_t *reader)
{
bcf1_t *line = reader->buffer[0];
}
}
-static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal)
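+// Returns the largest non-reference AD value of sample ismpl and sets *ial to the index of that allele (returns 0 with *ial=0 if there is none)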
+static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial)
{
- if ( !fmt ) return;
-
- float dvaf;
+ int iv, ad = 0;
+ *ial = 0;
#define BRANCH_INT(type_t,missing,vector_end) { \
- type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \
- if ( p[ial]==vector_end || p[jal]==vector_end ) return; \
- if ( p[ial]==missing || p[jal]==missing ) return; \
- if ( !p[ial] && !p[jal] ) return; \
- dvaf = (float)p[ial]/(p[ial]+p[jal]); \
+ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+ for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+ { \
+ if ( ptr[iv]==vector_end ) break; \
+ if ( ptr[iv]==missing ) continue; \
+ if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\
+ } \
}
- switch (fmt->type) {
+ switch (ad_fmt_ptr->type) {
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break;
+ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
}
#undef BRANCH_INT
-
+ return ad;
+}
+// Returns the FORMAT/AD value of allele `ial` for sample `ismpl`.
+// Returns 0 when the value is missing, when it is the vector-end marker, or
+// when `ial` is not a valid index into the per-sample AD vector (the record
+// carries fewer AD values than the genotype's allele index requires).
+// The `line` argument is not used.
+static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial)
+{
+    // The allele index comes from GT and is not guaranteed to have a matching
+    // AD entry; without this guard ptr[ial] reads outside this sample's values
+    if ( ial<0 || ial>=ad_fmt_ptr->n ) return 0;
+    #define BRANCH_INT(type_t,missing,vector_end) { \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        if ( ptr[ial]==vector_end ) return 0; \
+        if ( ptr[ial]==missing ) return 0; \
+        return ptr[ial]; \
+    }
+    switch (ad_fmt_ptr->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+        default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
+    }
+    #undef BRANCH_INT
+    // Every branch above returns or exits; this keeps the non-void function
+    // from falling off its end
+    return 0;
+}
+static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
+{
int len = line->d.var[ial].n;
if ( len < -stats->m_indel ) len = -stats->m_indel;
else if ( len > stats->m_indel ) len = stats->m_indel;
int bin = stats->m_indel + len;
stats->nvaf[bin]++;
- stats->dvaf[bin] += dvaf;
+ stats->dvaf[bin] += vaf;
+}
+// Maps a variant allele frequency to a histogram bin by rounding vaf/0.05 to the nearest integer
+#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
+// Increments one bin of a sample's VAF histogram for allele `ial`: the SNV
+// histogram when the allele's variant type is exactly VCF_SNP, the indel
+// histogram otherwise.
+static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
+{
+    int idx = vaf2bin(vaf);
+    // Each histogram in vaf_t has 21 bins (indices 0..20). The caller computes
+    // vaf as AD/depth where the depth may come from FORMAT/DP, which can be
+    // smaller than the AD value, so vaf can exceed 1. Clamp the index so that
+    // such records land in the outermost bin instead of writing out of bounds.
+    int max_idx = (int)(sizeof(smpl_vaf->snv)/sizeof(smpl_vaf->snv[0])) - 1;
+    if ( idx < 0 ) idx = 0;
+    else if ( idx > max_idx ) idx = max_idx;
+    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+    else smpl_vaf->indel[idx]++;
}
-static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
{
- bcf_srs_t *files = args->files;
- bcf1_t *line = reader->buffer[0];
- bcf_fmt_t *fmt_ptr;
- int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
- int line_type = bcf_get_variant_types(line);
+ if ( dp_fmt_ptr )
+ {
+ #define BRANCH_INT(type_t,missing,vector_end) { \
+ type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \
+ if ( *ptr==missing || *ptr==vector_end ) return -1; \
+ return *ptr; \
+ }
+ switch (dp_fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+ }
+ if ( ad_fmt_ptr )
+ {
+ int iv, dp = 0, has_value = 0;
+ #define BRANCH_INT(type_t,missing,vector_end) { \
+ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+ for (iv=0; iv<ad_fmt_ptr->n; iv++) \
+ { \
+ if ( ptr[iv]==vector_end ) break; \
+ if ( ptr[iv]==missing ) continue; \
+ has_value = 1; \
+ dp += ptr[iv]; \
+ } \
+ }
+ switch (ad_fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); exit(1); break;
+ }
+ #undef BRANCH_INT
+ if ( !has_value ) return -1;
+ return dp;
+ }
+ return -1;
+}
+static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal)
+{
+ if ( gt==GT_UNKN )
+ {
+ stats->smpl_missing[ismpl]++;
+ return;
+ }
- if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+ int var_type = 0;
+ if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+ if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+ if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
{
- bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL;
+ if ( var_type&VCF_INDEL && stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele );
+ stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+ }
+ if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++;
+ if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++;
+ return;
+ }
+ if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; }
+ #if HWE_STATS
+ switch (gt)
+ {
+ case GT_HOM_RR: args->nref_tot++; break;
+ case GT_HET_RA: args->nhet_tot++; break;
+ case GT_HET_AA:
+ case GT_HOM_AA: args->nalt_tot++; break;
+ }
+ #endif
- int ref = bcf_acgt2int(*line->d.allele[0]);
- int is, n_nref = 0, i_nref = 0;
- for (is=0; is<args->files->n_smpl; is++)
+ if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
+ {
+ if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++;
+ else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++;
+ else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++;
+ else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++;
+ if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
{
- int ial, jal;
- int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
- if ( gt==GT_UNKN )
- {
- stats->smpl_missing[is]++;
- continue;
- }
- if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ int alt = bcf_acgt2int(*line->d.allele[ial]);
+ if ( alt<0 ) return;
+ if ( abs(ref-alt)==2 )
+ stats->smpl_ts[ismpl]++;
+ else
+ stats->smpl_tv[ismpl]++;
+ }
+ if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal )
+ {
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ int alt = bcf_acgt2int(*line->d.allele[jal]);
+ if ( alt<0 ) return;
+ if ( abs(ref-alt)==2 )
+ stats->smpl_ts[ismpl]++;
+ else
+ stats->smpl_tv[ismpl]++;
+ }
+ }
+ if ( var_type&VCF_INDEL )
+ {
+ if ( gt != GT_HOM_RR )
+ {
+ stats->smpl_indels[ismpl]++;
+ if ( gt==GT_HET_RA || gt==GT_HET_AA )
{
- if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+ int is_ins = 0, is_del = 0;
+ if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
{
- assert( ial<line->n_allele );
- stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+ if ( line->d.var[ial].n < 0 ) is_del = 1;
+ else is_ins = 1;
}
- if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
- if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
- continue;
- }
- if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
- #if HWE_STATS
- switch (gt)
+ if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
{
- case GT_HOM_RR: nref_tot++; break;
- case GT_HET_RA: nhet_tot++; break;
- case GT_HET_AA:
- case GT_HOM_AA: nalt_tot++; break;
- }
- #endif
- int var_type = 0;
- if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
- if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
- if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
- {
- if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
- else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
- else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
- else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
- if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
- {
- int alt = bcf_acgt2int(*line->d.allele[ial]);
- if ( alt<0 ) continue;
- if ( abs(ref-alt)==2 )
- stats->smpl_ts[is]++;
- else
- stats->smpl_tv[is]++;
+ if ( line->d.var[jal].n < 0 ) is_del = 1;
+ else is_ins = 1;
}
+ // Note that alt-het genotypes with both ins and del allele are counted twice!!
+ if ( is_del ) stats->smpl_del_hets[ismpl]++;
+ if ( is_ins ) stats->smpl_ins_hets[ismpl]++;
}
- if ( var_type&VCF_INDEL )
+ else if ( gt==GT_HOM_AA )
{
- if ( gt != GT_HOM_RR )
- {
- stats->smpl_indels[is]++;
-
- if ( gt==GT_HET_RA || gt==GT_HET_AA )
- {
- int is_ins = 0, is_del = 0;
- if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
- {
- if ( line->d.var[ial].n < 0 ) is_del = 1;
- else is_ins = 1;
- update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal);
- }
- if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
- {
- if ( line->d.var[jal].n < 0 ) is_del = 1;
- else is_ins = 1;
- update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial);
- }
- // Note that alt-het genotypes with both ins and del allele are counted twice!!
- if ( is_del ) stats->smpl_del_hets[is]++;
- if ( is_ins ) stats->smpl_ins_hets[is]++;
- }
- else if ( gt==GT_HOM_AA )
- {
- if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++;
- else stats->smpl_ins_homs[is]++;
- }
- }
- if ( stats->smpl_frm_shifts )
- {
- assert( ial<line->n_allele && jal<line->n_allele );
- stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
- stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
- }
+ if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++;
+ else stats->smpl_ins_homs[ismpl]++;
}
}
- if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+ if ( stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele && jal<line->n_allele );
+ stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+ stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++;
+ }
}
+}
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+ bcf_srs_t *files = args->files;
+ bcf1_t *line = reader->buffer[0];
- #if HWE_STATS
- if ( nhet_tot + nref_tot + nalt_tot )
+ args->nref_tot = 0;
+ args->nhet_tot = 0;
+ args->nalt_tot = 0;
+ args->n_nref = 0;
+ args->i_nref = 0;
+
+ bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT");
+ bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD");
+ bcf_fmt_t *dp_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP");
+
+ int is;
+ for (is=0; is<args->files->n_smpl; is++)
+ {
+ // Determine depth
+ int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr);
+ if ( dp>0 )
{
- float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
- int idx = het_frac*(args->naf_hwe - 1);
-//check me: what is this?
- if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
- stats->af_hwe[idx]++;
+ (*idist(&stats->dp, dp))++;
+ stats->smpl_ndp[is]++;
+ stats->smpl_dp[is] += dp;
}
- #endif
- if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
- {
- #define BRANCH_INT(type_t,missing,vector_end) { \
- int is; \
- for (is=0; is<args->files->n_smpl; is++) \
- { \
- type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
- if ( *p==vector_end ) continue; \
- if ( *p!=missing ) \
- { \
- (*idist(&stats->dp, *p))++; \
- stats->smpl_ndp[is]++; \
- stats->smpl_dp[is] += *p; \
- } \
- } \
+ // Determine genotype
+ int ial, jal, gt=GT_UNKN;
+ if ( gt_fmt_ptr )
+ {
+ gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal);
+ sample_gt_stats(args,stats,line,is,gt,ial,jal);
}
- switch (fmt_ptr->type) {
- case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+
+ // Determine variant allele frequency
+ if ( dp>0 && ad_fmt_ptr )
+ {
+ float iad = 0, jad = 0;
+ if ( gt==GT_UNKN ) // GT not available
+ {
+ iad = get_ad(line,ad_fmt_ptr,is,&ial);
+ }
+ else if ( gt!=GT_UNKN )
+ {
+ iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial);
+ jad = jal==0 ? 0 : get_iad(line,ad_fmt_ptr,is,jal);
+ }
+ if ( iad )
+ {
+ update_dvaf(stats,line,ial,(float)iad/dp);
+ update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp);
+ }
+ if ( jad && iad!=jad )
+ {
+ update_dvaf(stats,line,jal,(float)jad/dp);
+ update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp);
+ }
}
- #undef BRANCH_INT
}
- else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) )
+ if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++;
+
+#if HWE_STATS
+ if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) )
{
- #define BRANCH_INT(type_t,missing,vector_end) { \
- int is,iv; \
- for (is=0; is<args->files->n_smpl; is++) \
- { \
- type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
- int dp = 0, has_value = 0; \
- for (iv=0; iv<fmt_ptr->n; iv++) \
- { \
- if ( p[iv]==vector_end ) break; \
- if ( p[iv]==missing ) continue; \
- has_value = 1; \
- dp += p[iv]; \
- } \
- if ( has_value ) \
- { \
- (*idist(&stats->dp, dp))++; \
- stats->smpl_ndp[is]++; \
- stats->smpl_dp[is] += dp; \
- } \
- } \
- }
- switch (fmt_ptr->type) {
- case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
- }
- #undef BRANCH_INT
+ // Number of heterozygous genotypes observed for any given allele frequency. This is used
+ // by plot-vcfstats to show the observed vs expected number of hets. There the expected number
+ // of hets is calculated from the probability P(het) = 2*AF*(1-AF).
+ // The array af_hwe is organized as follows
+ // m_af .. number of allele frequency bins
+ // naf_hwe .. the number of het genotype frequency bins
+ // iallele_freq*naf_hwe + ihet_freq
+ //
+ float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot);
+ int ihet_freq = het_frac * (args->naf_hwe - 1);
+ int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe;
+ stats->af_hwe[idx]++;
}
+#endif
if ( matched==3 )
{
if ( files->n_smpl )
do_sample_stats(args, stats, reader, ret);
- if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
- (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+ if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 )
+ (*idist(&stats->dp_sites, args->iarr[0]))++;
}
}
}
#endif
}
+
+ if ( args->stats[0].smpl_vaf )
+ {
+ printf("# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n");
+ printf("# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ printf("VAF\t%d\t%s\t", id,args->files->samples[i]);
+ for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].snv[j]);
+ printf("\t");
+ for (j=0; j<21; j++) printf("%s%d",j?",":"",stats->smpl_vaf[i].indel[j]);
+ printf("\n");
+ }
+ }
+ }
}
static void usage(void)
}
idist_t;
+// variant allele frequency (fraction of alt allele in pileup as determined from AD) collected into 0.05 bins
+typedef struct
+{
+ int snv[21], indel[21];
+}
+vaf_t;
+
typedef struct
{
uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs;
- int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+ int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
+ vaf_t vaf, *smpl_vaf; // total (INFO/AD) and per-sample (FMT/VAF) VAF distributions
unsigned long int *smpl_dp;
idist_t dp, dp_sites;
int nusr;
gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
bin_t *af_bins;
float *farr;
- int mfarr;
+ int32_t *iarr;
+ int mfarr, miarr;
+ int nref_tot, nhet_tot, nalt_tot, n_nref, i_nref;
// indel context
indel_ctx_t *indel_ctx;
if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
error("No such INFO tag: %s\n", args->af_tag);
+ int id, has_fmt_ad = ((id=bcf_hdr_id2int(hdr,BCF_DT_ID,"AD"))>=0 && bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id)) ? 1 : 0;
+
#if QUAL_STATS
args->m_qual = 999;
#endif
stats->smpl_dp = (unsigned long int *) calloc(args->files->n_smpl,sizeof(unsigned long int));
stats->smpl_ndp = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_sngl = (int *) calloc(args->files->n_smpl,sizeof(int));
+ if ( has_fmt_ad )
+ stats->smpl_vaf = (vaf_t*) calloc(args->files->n_smpl,sizeof(vaf_t));
#if HWE_STATS
stats->af_hwe = (int*) calloc(args->m_af*args->naf_hwe,sizeof(int));
#endif
free(stats->smpl_dp);
free(stats->smpl_ndp);
free(stats->smpl_sngl);
+ free(stats->smpl_vaf);
idist_destroy(&stats->dp);
idist_destroy(&stats->dp_sites);
for (j=0; j<stats->nusr; j++)
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
if ( args->af_bins ) bin_destroy(args->af_bins);
free(args->farr);
+ free(args->iarr);
free(args->usr);
free(args->tmp_frm);
free(args->tmp_iaf);
if (args->filter[1]) filter_destroy(args->filter[1]);
}
+// The array tmp_iaf keeps the index of the AF bin for each allele, the first bin is for singletons.
+// The number of bins is either m_af (101) or as given by the user in --af-bins
static void init_iaf(args_t *args, bcf_sr_t *reader)
{
bcf1_t *line = reader->buffer[0];
}
}
-static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal)
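+// Returns the largest non-reference AD value of sample ismpl and sets *ial to the index of that allele (returns 0 with *ial=0 if there is none)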
+static inline int get_ad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int *ial)
{
- if ( !fmt ) return;
-
- float dvaf;
+ int iv, ad = 0;
+ *ial = 0;
#define BRANCH_INT(type_t,missing,vector_end) { \
- type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \
- if ( p[ial]==vector_end || p[jal]==vector_end ) return; \
- if ( p[ial]==missing || p[jal]==missing ) return; \
- if ( !p[ial] && !p[jal] ) return; \
- dvaf = (float)p[ial]/(p[ial]+p[jal]); \
+ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+ for (iv=1; iv<ad_fmt_ptr->n; iv++) \
+ { \
+ if ( ptr[iv]==vector_end ) break; \
+ if ( ptr[iv]==missing ) continue; \
+ if ( ad < ptr[iv] ) { ad = ptr[iv]; *ial = iv; }\
+ } \
}
- switch (fmt->type) {
+ switch (ad_fmt_ptr->type) {
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); bcftools_exit(1); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break;
}
#undef BRANCH_INT
-
+ return ad;
+}
+// Returns the FORMAT/AD value of allele `ial` for sample `ismpl`.
+// Returns 0 when the value is missing, when it is the vector-end marker, or
+// when `ial` is not a valid index into the per-sample AD vector (the record
+// carries fewer AD values than the genotype's allele index requires).
+// The `line` argument is not used.
+static inline int get_iad(bcf1_t *line, bcf_fmt_t *ad_fmt_ptr, int ismpl, int ial)
+{
+    // The allele index comes from GT and is not guaranteed to have a matching
+    // AD entry; without this guard ptr[ial] reads outside this sample's values
+    if ( ial<0 || ial>=ad_fmt_ptr->n ) return 0;
+    #define BRANCH_INT(type_t,missing,vector_end) { \
+        type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+        if ( ptr[ial]==vector_end ) return 0; \
+        if ( ptr[ial]==missing ) return 0; \
+        return ptr[ial]; \
+    }
+    switch (ad_fmt_ptr->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+        default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break;
+    }
+    #undef BRANCH_INT
+    // Guards against falling off the end of a non-void function should the
+    // default branch ever return
+    return 0;
+}
+static inline void update_dvaf(stats_t *stats, bcf1_t *line, int ial, float vaf)
+{
int len = line->d.var[ial].n;
if ( len < -stats->m_indel ) len = -stats->m_indel;
else if ( len > stats->m_indel ) len = stats->m_indel;
int bin = stats->m_indel + len;
stats->nvaf[bin]++;
- stats->dvaf[bin] += dvaf;
+ stats->dvaf[bin] += vaf;
+}
+// Maps a variant allele frequency to a histogram bin by rounding vaf/0.05 to the nearest integer
+#define vaf2bin(vaf) ((int)nearbyintf((vaf)/0.05))
+// Increments one bin of a sample's VAF histogram for allele `ial`: the SNV
+// histogram when the allele's variant type is exactly VCF_SNP, the indel
+// histogram otherwise.
+static inline void update_vaf(vaf_t *smpl_vaf, bcf1_t *line, int ial, float vaf)
+{
+    int idx = vaf2bin(vaf);
+    // Each histogram in vaf_t has 21 bins (indices 0..20). The caller computes
+    // vaf as AD/depth where the depth may come from FORMAT/DP, which can be
+    // smaller than the AD value, so vaf can exceed 1. Clamp the index so that
+    // such records land in the outermost bin instead of writing out of bounds.
+    int max_idx = (int)(sizeof(smpl_vaf->snv)/sizeof(smpl_vaf->snv[0])) - 1;
+    if ( idx < 0 ) idx = 0;
+    else if ( idx > max_idx ) idx = max_idx;
+    if ( bcf_get_variant_type(line,ial)==VCF_SNP ) smpl_vaf->snv[idx]++;
+    else smpl_vaf->indel[idx]++;
}
-static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+static inline int calc_sample_depth(args_t *args, int ismpl, bcf_fmt_t *ad_fmt_ptr, bcf_fmt_t *dp_fmt_ptr)
{
- bcf_srs_t *files = args->files;
- bcf1_t *line = reader->buffer[0];
- bcf_fmt_t *fmt_ptr;
- int nref_tot = 0, nhet_tot = 0, nalt_tot = 0;
- int line_type = bcf_get_variant_types(line);
+ if ( dp_fmt_ptr )
+ {
+ #define BRANCH_INT(type_t,missing,vector_end) { \
+ type_t *ptr = (type_t *) (dp_fmt_ptr->p + dp_fmt_ptr->size*ismpl); \
+ if ( *ptr==missing || *ptr==vector_end ) return -1; \
+ return *ptr; \
+ }
+ switch (dp_fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, dp_fmt_ptr->type); bcftools_exit(1); break;
+ }
+ #undef BRANCH_INT
+ }
+ if ( ad_fmt_ptr )
+ {
+ int iv, dp = 0, has_value = 0;
+ #define BRANCH_INT(type_t,missing,vector_end) { \
+ type_t *ptr = (type_t *) (ad_fmt_ptr->p + ad_fmt_ptr->size*ismpl); \
+ for (iv=0; iv<ad_fmt_ptr->n; iv++) \
+ { \
+ if ( ptr[iv]==vector_end ) break; \
+ if ( ptr[iv]==missing ) continue; \
+ has_value = 1; \
+ dp += ptr[iv]; \
+ } \
+ }
+ switch (ad_fmt_ptr->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, ad_fmt_ptr->type); bcftools_exit(1); break;
+ }
+ #undef BRANCH_INT
+ if ( !has_value ) return -1;
+ return dp;
+ }
+ return -1;
+}
+static inline void sample_gt_stats(args_t *args, stats_t *stats, bcf1_t *line, int ismpl, int gt, int ial, int jal)
+{
+ if ( gt==GT_UNKN )
+ {
+ stats->smpl_missing[ismpl]++;
+ return;
+ }
- if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) )
+ int var_type = 0;
+ if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+ if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+ if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
{
- bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL;
+ if ( var_type&VCF_INDEL && stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele );
+ stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+ }
+ if ( gt == GT_HAPL_R ) stats->smpl_hapRef[ismpl]++;
+ if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[ismpl]++;
+ return;
+ }
+ if ( gt != GT_HOM_RR ) { args->n_nref++; args->i_nref = ismpl; }
+ #if HWE_STATS
+ switch (gt)
+ {
+ case GT_HOM_RR: args->nref_tot++; break;
+ case GT_HET_RA: args->nhet_tot++; break;
+ case GT_HET_AA:
+ case GT_HOM_AA: args->nalt_tot++; break;
+ }
+ #endif
- int ref = bcf_acgt2int(*line->d.allele[0]);
- int is, n_nref = 0, i_nref = 0;
- for (is=0; is<args->files->n_smpl; is++)
+ if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
+ {
+ if ( gt == GT_HET_RA ) stats->smpl_hets[ismpl]++;
+ else if ( gt == GT_HET_AA ) stats->smpl_hets[ismpl]++;
+ else if ( gt == GT_HOM_RR ) stats->smpl_homRR[ismpl]++;
+ else if ( gt == GT_HOM_AA ) stats->smpl_homAA[ismpl]++;
+ if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
{
- int ial, jal;
- int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
- if ( gt==GT_UNKN )
- {
- stats->smpl_missing[is]++;
- continue;
- }
- if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ int alt = bcf_acgt2int(*line->d.allele[ial]);
+ if ( alt<0 ) return;
+ if ( abs(ref-alt)==2 )
+ stats->smpl_ts[ismpl]++;
+ else
+ stats->smpl_tv[ismpl]++;
+ }
+ if ( gt != GT_HOM_RR && line->d.var[jal].type&VCF_SNP && ial!=jal )
+ {
+ int ref = bcf_acgt2int(*line->d.allele[0]);
+ int alt = bcf_acgt2int(*line->d.allele[jal]);
+ if ( alt<0 ) return;
+ if ( abs(ref-alt)==2 )
+ stats->smpl_ts[ismpl]++;
+ else
+ stats->smpl_tv[ismpl]++;
+ }
+ }
+ if ( var_type&VCF_INDEL )
+ {
+ if ( gt != GT_HOM_RR )
+ {
+ stats->smpl_indels[ismpl]++;
+ if ( gt==GT_HET_RA || gt==GT_HET_AA )
{
- if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
+ int is_ins = 0, is_del = 0;
+ if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
{
- assert( ial<line->n_allele );
- stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
+ if ( line->d.var[ial].n < 0 ) is_del = 1;
+ else is_ins = 1;
}
- if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
- if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
- continue;
- }
- if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
- #if HWE_STATS
- switch (gt)
+ if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
{
- case GT_HOM_RR: nref_tot++; break;
- case GT_HET_RA: nhet_tot++; break;
- case GT_HET_AA:
- case GT_HOM_AA: nalt_tot++; break;
- }
- #endif
- int var_type = 0;
- if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
- if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
- if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
- {
- if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
- else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
- else if ( gt == GT_HOM_RR ) stats->smpl_homRR[is]++;
- else if ( gt == GT_HOM_AA ) stats->smpl_homAA[is]++;
- if ( gt != GT_HOM_RR && line->d.var[ial].type&VCF_SNP ) // this is safe, bcf_get_variant_types has been already called
- {
- int alt = bcf_acgt2int(*line->d.allele[ial]);
- if ( alt<0 ) continue;
- if ( abs(ref-alt)==2 )
- stats->smpl_ts[is]++;
- else
- stats->smpl_tv[is]++;
+ if ( line->d.var[jal].n < 0 ) is_del = 1;
+ else is_ins = 1;
}
+ // Note that alt-het genotypes with both ins and del allele are counted twice!!
+ if ( is_del ) stats->smpl_del_hets[ismpl]++;
+ if ( is_ins ) stats->smpl_ins_hets[ismpl]++;
}
- if ( var_type&VCF_INDEL )
+ else if ( gt==GT_HOM_AA )
{
- if ( gt != GT_HOM_RR )
- {
- stats->smpl_indels[is]++;
-
- if ( gt==GT_HET_RA || gt==GT_HET_AA )
- {
- int is_ins = 0, is_del = 0;
- if ( bcf_get_variant_type(line,ial)&VCF_INDEL )
- {
- if ( line->d.var[ial].n < 0 ) is_del = 1;
- else is_ins = 1;
- update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal);
- }
- if ( bcf_get_variant_type(line,jal)&VCF_INDEL )
- {
- if ( line->d.var[jal].n < 0 ) is_del = 1;
- else is_ins = 1;
- update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial);
- }
- // Note that alt-het genotypes with both ins and del allele are counted twice!!
- if ( is_del ) stats->smpl_del_hets[is]++;
- if ( is_ins ) stats->smpl_ins_hets[is]++;
- }
- else if ( gt==GT_HOM_AA )
- {
- if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++;
- else stats->smpl_ins_homs[is]++;
- }
- }
- if ( stats->smpl_frm_shifts )
- {
- assert( ial<line->n_allele && jal<line->n_allele );
- stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
- stats->smpl_frm_shifts[is*3 + args->tmp_frm[jal]]++;
- }
+ if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[ismpl]++;
+ else stats->smpl_ins_homs[ismpl]++;
}
}
- if ( n_nref==1 ) stats->smpl_sngl[i_nref]++;
+ if ( stats->smpl_frm_shifts )
+ {
+ assert( ial<line->n_allele && jal<line->n_allele );
+ stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[ial]]++;
+ stats->smpl_frm_shifts[ismpl*3 + args->tmp_frm[jal]]++;
+ }
}
+}
+static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched)
+{
+ bcf_srs_t *files = args->files;
+ bcf1_t *line = reader->buffer[0];
- #if HWE_STATS
- if ( nhet_tot + nref_tot + nalt_tot )
+ args->nref_tot = 0;
+ args->nhet_tot = 0;
+ args->nalt_tot = 0;
+ args->n_nref = 0;
+ args->i_nref = 0;
+
+ bcf_fmt_t *gt_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT");
+ bcf_fmt_t *ad_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD");
+ bcf_fmt_t *dp_fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP");
+
+ int is;
+ for (is=0; is<args->files->n_smpl; is++)
+ {
+ // Determine depth
+ int dp = calc_sample_depth(args,is,ad_fmt_ptr,dp_fmt_ptr);
+ if ( dp>0 )
{
- float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
- int idx = het_frac*(args->naf_hwe - 1);
-//check me: what is this?
- if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
- stats->af_hwe[idx]++;
+ (*idist(&stats->dp, dp))++;
+ stats->smpl_ndp[is]++;
+ stats->smpl_dp[is] += dp;
}
- #endif
- if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"DP")) )
- {
- #define BRANCH_INT(type_t,missing,vector_end) { \
- int is; \
- for (is=0; is<args->files->n_smpl; is++) \
- { \
- type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
- if ( *p==vector_end ) continue; \
- if ( *p!=missing ) \
- { \
- (*idist(&stats->dp, *p))++; \
- stats->smpl_ndp[is]++; \
- stats->smpl_dp[is] += *p; \
- } \
- } \
+ // Determine genotype
+ int ial, jal, gt=GT_UNKN;
+ if ( gt_fmt_ptr )
+ {
+ gt = bcf_gt_type(gt_fmt_ptr, reader->samples[is], &ial, &jal);
+ sample_gt_stats(args,stats,line,is,gt,ial,jal);
}
- switch (fmt_ptr->type) {
- case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
+
+ // Determine variant allele frequency
+ if ( dp>0 && ad_fmt_ptr )
+ {
+ float iad = 0, jad = 0;
+ if ( gt==GT_UNKN ) // GT not available
+ {
+ iad = get_ad(line,ad_fmt_ptr,is,&ial);
+ }
+ else if ( gt!=GT_UNKN )
+ {
+ iad = ial==0 ? 0 : get_iad(line,ad_fmt_ptr,is,ial);
+ jad = jal==0 ? 0 : get_iad(line,ad_fmt_ptr,is,jal);
+ }
+ if ( iad )
+ {
+ update_dvaf(stats,line,ial,(float)iad/dp);
+ update_vaf(&stats->smpl_vaf[is],line,ial,(float)iad/dp);
+ }
+ if ( jad && iad!=jad )
+ {
+ update_dvaf(stats,line,jal,(float)jad/dp);
+ update_vaf(&stats->smpl_vaf[is],line,jal,(float)jad/dp);
+ }
}
- #undef BRANCH_INT
}
- else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) )
+ if ( args->n_nref==1 ) stats->smpl_sngl[args->i_nref]++;
+
+#if HWE_STATS
+ if ( gt_fmt_ptr && line->n_allele > 1 && (args->nref_tot || args->nhet_tot || args->nalt_tot) )
{
- #define BRANCH_INT(type_t,missing,vector_end) { \
- int is,iv; \
- for (is=0; is<args->files->n_smpl; is++) \
- { \
- type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \
- int dp = 0, has_value = 0; \
- for (iv=0; iv<fmt_ptr->n; iv++) \
- { \
- if ( p[iv]==vector_end ) break; \
- if ( p[iv]==missing ) continue; \
- has_value = 1; \
- dp += p[iv]; \
- } \
- if ( has_value ) \
- { \
- (*idist(&stats->dp, dp))++; \
- stats->smpl_ndp[is]++; \
- stats->smpl_dp[is] += dp; \
- } \
- } \
- }
- switch (fmt_ptr->type) {
- case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
- case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
- case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
- }
- #undef BRANCH_INT
+ // Number of heterozygous genotypes observed for any given allele frequency. This is used
+ // by plot-vcfstats to show the observed vs expected number of hets. There the expected number
+ // of hets is calculated from the probability P(het) = 2*AF*(1-AF).
+ // The array af_hwe is organized as follows
+ // m_af .. number of allele frequency bins
+ // naf_hwe .. the number of het genotype frequency bins
+ // iallele_freq*naf_hwe + ihet_freq
+ //
+ float het_frac = (float)args->nhet_tot / (args->nref_tot + args->nhet_tot + args->nalt_tot);
+ int ihet_freq = het_frac * (args->naf_hwe - 1);
+ int idx = ihet_freq + args->tmp_iaf[1] * args->naf_hwe;
+ stats->af_hwe[idx]++;
}
+#endif
if ( matched==3 )
{
if ( files->n_smpl )
do_sample_stats(args, stats, reader, ret);
- if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 )
- (*idist(&stats->dp_sites, args->tmp_iaf[0]))++;
+ if ( bcf_get_info_int32(reader->header,line,"DP",&args->iarr,&args->miarr)==1 )
+ (*idist(&stats->dp_sites, args->iarr[0]))++;
}
}
}
#endif
}
+
+ if ( args->stats[0].smpl_vaf )
+ {
+ fprintf(bcftools_stdout, "# VAF, Variant Allele Frequency determined as fraction of alternate reads in FORMAT/AD\n");
+ fprintf(bcftools_stdout, "# VAF\t[2]id\t[3]sample\t[4]SNV VAF distribution\t[5]indel VAF distribution\n");
+ for (id=0; id<args->nstats; id++)
+ {
+ stats_t *stats = &args->stats[id];
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ fprintf(bcftools_stdout, "VAF\t%d\t%s\t", id,args->files->samples[i]);
+ for (j=0; j<21; j++) fprintf(bcftools_stdout, "%s%d",j?",":"",stats->smpl_vaf[i].snv[j]);
+ fprintf(bcftools_stdout, "\t");
+ for (j=0; j<21; j++) fprintf(bcftools_stdout, "%s%d",j?",":"",stats->smpl_vaf[i].indel[j]);
+ fprintf(bcftools_stdout, "\n");
+ }
+ }
+ }
}
static void usage(void)
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
char *include_types, *exclude_types;
int include, exclude;
int record_cmd_line;
+ char *index_fn;
+ int write_index;
htsFile *out;
}
args_t;
fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n");
fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(stderr, " --write-index Automatically index the output files [off]\n");
fprintf(stderr, "\n");
exit(1);
}
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->write_index = 0;
args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
args->regions_overlap = 1;
args->targets_overlap = 0;
{"phased",no_argument,NULL,'p'},
{"exclude-phased",no_argument,NULL,'P'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->write_index = 1; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
else if ( args->output_type & FT_BCF )
error("BCF output requires header, cannot proceed with -H\n");
+ if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+
int ret = 0;
if (!args->header_only)
{
ret = args->files->errnum;
if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
}
- hts_close(args->out);
+
+ if (args->write_index)
+ {
+ if (bcf_idx_save(args->out) < 0)
+ {
+ if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+
+ if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout");
destroy_data(args);
bcf_sr_destroy(args->files);
free(args);
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
char *include_types, *exclude_types;
int include, exclude;
int record_cmd_line;
+ char *index_fn;
+ int write_index;
htsFile *out;
}
args_t;
fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n");
fprintf(bcftools_stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(bcftools_stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(bcftools_stderr, " --write-index Automatically index the output files [off]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->write_index = 0;
args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
args->regions_overlap = 1;
args->targets_overlap = 0;
{"phased",no_argument,NULL,'p'},
{"exclude-phased",no_argument,NULL,'P'},
{"no-version",no_argument,NULL,8},
+ {"write-index",no_argument,NULL,10},
{NULL,0,NULL,0}
};
char *tmp;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->write_index = 1; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
else if ( args->output_type & FT_BCF )
error("BCF output requires header, cannot proceed with -H\n");
+ if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out);
+
int ret = 0;
if (!args->header_only)
{
ret = args->files->errnum;
if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
}
- hts_close(args->out);
+
+ if (args->write_index)
+ {
+ if (bcf_idx_save(args->out) < 0)
+ {
+ if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"bcftools_stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
+
+ if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"bcftools_stdout");
destroy_data(args);
bcf_sr_destroy(args->files);
free(args);
/* version.c -- report version numbers for plugins.
- Copyright (C) 2014-2021 Genome Research Ltd.
+ Copyright (C) 2014-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
const char *hts_bcf_wmode2(int file_type, const char *fname)
{
if ( !fname ) return hts_bcf_wmode(file_type);
- int len = strlen(fname);
- if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
- if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
- if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
- if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+ const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; // fname may carry extra text after HTS_IDX_DELIM; the write mode is chosen from the part before it
+ if ( !end ) end = fname ? fname + strlen(fname) : fname; // no delimiter found: the whole string is the file name
+ int len = end - fname; // length of the file-name part only
+ if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); // bounded compare: the suffix is not at the end of the string when a delimiter follows
+ if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF);
+ if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+ if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); // .vcf.bgz is treated the same as .vcf.gz
return hts_bcf_wmode(file_type);
}
void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
{
const char *ret = NULL;
- int len = fname ? strlen(fname) : 0;
- if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
- else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
- else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
- else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+ const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+ if ( !end ) end = fname ? fname + strlen(fname) : fname;
+ int len = end - fname;
+ if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+ else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
+ else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+ else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
else ret = hts_bcf_wmode(file_type);
if ( clevel>=0 && clevel<=9 )
{
else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2;
else return -1;
}
+
+// See also samtools/sam_utils.c auto_index()
+//
+// Set up writing of an index alongside the output file fh.
+//
+//  fh         .. output file handle passed on to bcf_idx_init()
+//  hdr        .. header passed on to bcf_idx_init()
+//  fname      .. output file name; NULL, "" and "-" are rejected. If it
+//                contains HTS_IDX_DELIM, the text after the delimiter is
+//                used as the index file name, otherwise "<fname>.csi".
+//  idx_fname  .. on success set to a newly allocated copy of the index
+//                file name, which the caller must free()
+//
+// Returns 0 on success and -1 on failure. When bcf_idx_init() fails the
+// allocated name is released and *idx_fname is reset to NULL, so nothing
+// is leaked and the caller is not left with an unusable index name.
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
+{
+    int min_shift = 14; // CSI
+
+    if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
+
+    char *delim = strstr(fname, HTS_IDX_DELIM);
+    if (delim)
+    {
+        // Explicit index name given after the delimiter
+        delim += strlen(HTS_IDX_DELIM);
+        *idx_fname = strdup(delim);
+        if ( !*idx_fname ) return -1;
+
+        // A ".tbi" index name switches min_shift from 14 to 0
+        // (presumably selecting TBI rather than CSI -- see bcf_idx_init)
+        size_t l = strlen(*idx_fname);
+        if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
+    }
+    else
+    {
+        // No explicit name: default to "<fname>.csi"
+        if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
+        sprintf(*idx_fname, "%s.csi", fname);
+    }
+
+    if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 )
+    {
+        free(*idx_fname);
+        *idx_fname = NULL;
+        return -1;
+    }
+
+    return 0;
+}
+
+
/* version.c -- report version numbers for plugins.
- Copyright (C) 2014-2021 Genome Research Ltd.
+ Copyright (C) 2014-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
const char *hts_bcf_wmode2(int file_type, const char *fname)
{
if ( !fname ) return hts_bcf_wmode(file_type);
- int len = strlen(fname);
- if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);
- if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
- if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
- if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+ const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; // fname may carry extra text after HTS_IDX_DELIM; the write mode is chosen from the part before it
+ if ( !end ) end = fname ? fname + strlen(fname) : fname; // no delimiter found: the whole string is the file name
+ int len = end - fname; // length of the file-name part only
+ if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); // bounded compare: the suffix is not at the end of the string when a delimiter follows
+ if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF);
+ if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+ if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); // .vcf.bgz is treated the same as .vcf.gz
return hts_bcf_wmode(file_type);
}
void set_wmode(char dst[8], int file_type, const char *fname, int clevel)
{
const char *ret = NULL;
- int len = fname ? strlen(fname) : 0;
- if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
- else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
- else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
- else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+ const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL;
+ if ( !end ) end = fname ? fname + strlen(fname) : fname;
+ int len = end - fname;
+ if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+ else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF);
+ else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+ else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
else ret = hts_bcf_wmode(file_type);
if ( clevel>=0 && clevel<=9 )
{
else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2;
else return -1;
}
+
+// See also samtools/sam_utils.c auto_index()
+//
+// Set up writing of an index alongside the output file fh.
+//
+//  fh         .. output file handle passed on to bcf_idx_init()
+//  hdr        .. header passed on to bcf_idx_init()
+//  fname      .. output file name; NULL, "" and "-" are rejected. If it
+//                contains HTS_IDX_DELIM, the text after the delimiter is
+//                used as the index file name, otherwise "<fname>.csi".
+//  idx_fname  .. on success set to a newly allocated copy of the index
+//                file name, which the caller must free()
+//
+// Returns 0 on success and -1 on failure. When bcf_idx_init() fails the
+// allocated name is released and *idx_fname is reset to NULL, so nothing
+// is leaked and the caller is not left with an unusable index name.
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname)
+{
+    int min_shift = 14; // CSI
+
+    if ( !fname || !*fname || !strcmp(fname, "-") ) return -1;
+
+    char *delim = strstr(fname, HTS_IDX_DELIM);
+    if (delim)
+    {
+        // Explicit index name given after the delimiter
+        delim += strlen(HTS_IDX_DELIM);
+        *idx_fname = strdup(delim);
+        if ( !*idx_fname ) return -1;
+
+        // A ".tbi" index name switches min_shift from 14 to 0
+        // (presumably selecting TBI rather than CSI -- see bcf_idx_init)
+        size_t l = strlen(*idx_fname);
+        if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0;
+    }
+    else
+    {
+        // No explicit name: default to "<fname>.csi"
+        if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1;
+        sprintf(*idx_fname, "%s.csi", fname);
+    }
+
+    if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 )
+    {
+        free(*idx_fname);
+        *idx_fname = NULL;
+        return -1;
+    }
+
+    return 0;
+}
+
+
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.17
+VERSION=1.18
# If we have a git clone, then check against the current tag
if [ -e .git ]
+++ /dev/null
-import os
-import re
-import sys
-
-try:
- from Cython.Distutils import build_ext
-except ImportError:
- from setuptools.command.build_ext import build_ext
-
-from distutils.extension import Extension
-from distutils.sysconfig import get_config_var, get_config_vars, get_python_version
-from pkg_resources import Distribution
-
-
-if sys.platform == 'darwin':
- config_vars = get_config_vars()
- config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
- config_vars['SHLIB_EXT'] = '.so'
-
-
-def is_pip_install():
- if "_" in os.environ and os.environ["_"].endswith("pip"):
- return True
- if "pip-egg-info" in sys.argv:
- return True
- if re.search("/pip-.*-build/", __file__):
- return True
- return False
-
-
-class CyExtension(Extension):
- def __init__(self, *args, **kwargs):
- self._init_func = kwargs.pop("init_func", None)
- self._prebuild_func = kwargs.pop("prebuild_func", None)
- Extension.__init__(self, *args, **kwargs)
-
- def extend_includes(self, includes):
- self.include_dirs.extend(includes)
-
- def extend_macros(self, macros):
- self.define_macros.extend(macros)
-
- def extend_extra_objects(self, objs):
- self.extra_objects.extend(objs)
-
-
-class cy_build_ext(build_ext):
-
- def _get_egg_name(self):
- ei_cmd = self.get_finalized_command("egg_info")
- return Distribution(
- None, None, ei_cmd.egg_name, ei_cmd.egg_version, get_python_version(),
- self.distribution.has_ext_modules() and self.plat_name).egg_name()
-
- def build_extension(self, ext):
-
- if isinstance(ext, CyExtension) and ext._init_func:
- ext._init_func(ext)
-
- if not self.inplace:
- ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
-
- if sys.platform == 'darwin':
- # The idea is to give shared libraries an install name of the form
- # `@rpath/<library-name.so>`, and to set the rpath equal to
- # @loader_path. This will allow Python packages to find the library
- # in the expected place, while still giving enough flexibility to
- # external applications to link against the library.
- relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO'))
- library_path = os.path.join(
- "@rpath", os.path.basename(relative_module_path)
- )
-
- if not ext.extra_link_args:
- ext.extra_link_args = []
- ext.extra_link_args += ['-dynamiclib',
- '-rpath', '@loader_path',
- '-Wl,-headerpad_max_install_names',
- '-Wl,-install_name,%s' % library_path,
- '-Wl,-x']
- else:
- if not ext.extra_link_args:
- ext.extra_link_args = []
-
- ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
-
- if isinstance(ext, CyExtension) and ext._prebuild_func:
- ext._prebuild_func(ext, self.force)
-
- build_ext.build_extension(self, ext)
"htslib": (
'htslib/tabix.c', 'htslib/bgzip.c',
'htslib/htsfile.c',
- "test", "tests"),
+ "samples", "test", "tests"),
}
--- /dev/null
+#!/bin/sh -e
+
+if test -x /usr/bin/dnf; then
+ echo Installing prerequisites via dnf...
+ dnf -y install epel-release
+ dnf -y install zlib-devel bzip2-devel xz-devel curl-devel samtools bcftools htslib-tools
+
+elif test -x /usr/bin/yum; then
+ if yum -y install epel-release; then
+ echo Installing prerequisites via yum...
+ yum -y install zlib-devel bzip2-devel xz-devel curl-devel samtools bcftools htslib-tools
+ else
+ echo Installing non-test prerequisites via yum...
+ yum -y install zlib-devel bzip2-devel xz-devel curl-devel
+ fi
+
+elif test -d /etc/dpkg; then
+ echo Installing prerequisites via apt-get...
+ apt-get update
+ apt-get install -y --no-install-recommends --no-install-suggests libcurl4-openssl-dev zlib1g-dev libbz2-dev liblzma-dev samtools bcftools tabix
+
+elif test -x /sbin/apk; then
+ echo Installing non-test prerequisites via apk...
+ apk update
+ apk add zlib-dev bzip2-dev xz-dev curl-dev
+
+elif test -x ${HOMEBREW_PREFIX-/usr/local}/bin/brew; then
+ echo Installing prerequisites via brew...
+ HOMEBREW_NO_AUTO_UPDATE=1 brew install -q samtools bcftools
+ brew unlink xz || true
+
+else
+ echo No package manager detected
+fi
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys, os, setuptools
+import sys, os, re, setuptools
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc',
'sphinx.ext.autosummary',
+ 'sphinx.ext.extlinks',
'sphinx.ext.todo',
'sphinx.ext.ifconfig',
'sphinx.ext.intersphinx',
# General information about the project.
project = u'pysam'
-copyright = u'2009–2021, Andreas Heger, Kevin Jacobs, et al'
+copyright = '2009–2023 Andreas Heger, John Marshall, Kevin Jacobs, et al'
# Included at the end of each rst file
rst_epilog = '''
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
+# -- Rewrite "PR #NNN" and "#NNN" in NEWS as URL links -------------------------
+
+extlinks = {
+ 'issue': ('https://github.com/pysam-developers/pysam/issues/%s', '#%s'),
+ 'pull': ('https://github.com/pysam-developers/pysam/pull/%s', 'PR #%s'),
+ }
+
+def expand_github_references(text):
+ text = re.sub(r'PR\s*#(\d+)', r':pull:`\1`', text)
+ text = re.sub(r'#(\d+)', r':issue:`\1`', text)
+ return text
+
+def include_read(app, relative_path, parent_docname, source):
+ if relative_path.name == 'NEWS':
+ source[0] = expand_github_references(source[0])
+
+def setup(app):
+ try:
+ app.connect('include-read', include_read)
+ except:
+ pass # Sphinx is too old to link issues/PRs
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'default'
+html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'pysam.tex', u'pysam documentation',
- u'Andreas Heger, Kevin Jacobs, et al.', 'manual'),
+ 'Andreas Heger, John Marshall, Kevin Jacobs, et al', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
pysam: htslib interface for python
==================================
-:Author: Andreas Heger, Kevin Jacobs and contributors
+:Author: Andreas Heger, John Marshall, Kevin Jacobs, and contributors
:Date: |today|
:Version: |version|
using cython and a high-level, pythonic API for convenient access to
the data within genomic file formats.
-The current version wraps *htslib-1.17*, *samtools-1.17*, and *bcftools-1.17*.
+The current version wraps *htslib-1.18*, *samtools-1.18*, and *bcftools-1.18*.
To install the latest release, type::
See the :ref:`Installation notes <installation>` for details.
+This module is unrelated to NREL-PySAM_, which wraps the National Renewable
+Energy Laboratory's System Advisor Model.
+
+.. _NREL-PySAM: https://nrel-pysam.readthedocs.io/
+
Contents
--------
pip install pysam
-This will compile the ``builtin`` htslib source code within pysam.
+Generally you will have the ``wheel`` package installed and
+this command will speedily install pysam from a pre-built wheel.
+Otherwise, or if you use pip's ``--no-binary`` option, this will
+compile the ``builtin`` htslib source code within pysam and allow
+the configuration facilities described below to be used.
htslib_ can be configured at compilation to turn on additional
features such support using encrypted configurations, enable plugins,
Release notes
=============
-Release 0.19.1
-==============
-
-This release wraps htslib/samtools/bcftools version 1.15.1.
-
-* [#1104] add an add_samples() method to quickly add multiple samples
- to VCF.
-
-Release 0.19.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.15.
-
-* [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands
-
-* [#1078] Support BAM_CPAD in get_aligned_pairs
-
-* [#1063] Run flake8 and fix some linting issues
-
-* [#1088] Add AlignedSegment is_mapped/mate_is_mapped/is_forward/mate_is_forward properties
-
-* Write an absent AlignedSegment.qual as all-bytes-0xff
-
-* Fix BGZFile.read() behaviour near or at EOF
-
-* First API for the htslib modified bases interface
-
-Release 0.18.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.14.
-
-* [#1048] and [#1060], clarify documentation of index statistics with CRAM files
-* Prevent "retval may be used uninitialised" warning.
-* Add new "samples" subcommand to pysam/samtools.py
-* Introduce TupleProxyIterator iterator object class
-
-Release 0.17.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.13. Corresponding
-to new samtools commands, `pysam.samtools` now has additional functions
-`ampliconclip`, `ampliconstats`, `fqimport`, and `version`.
-
-Bugs fixed:
-
-* [#447] The maximum QNAME length is fully restored to 254
-* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors
-* [#603] count_coverage: ignore reads that have no SEQ field
-* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault
-* [#983] Add win32/\*.[ch] to MANIFEST.in
-* [#994] Raise exception in ``get_tid()`` if header could not be parsed
-* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi
-* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases
-* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax
-* [#1035] Improved handling of file iteration errors
-* [#1038] ``tabix_index()`` no longer leaks file descriptors
-* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value
- (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based)
-* *setup.py* longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0
-
-New facilities:
-
-* [PR #963] Additional VCF classes are exposed to pysam programmers
-* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion
-* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize
-* Running tests with ``pytest`` now automatically runs ``make`` to generate test data
-
-Documentation improvements:
-
-* [#726] Clarify get_forward_sequence/get_forward_qualities documentation
-* [#865] Improved example
-* [#968] ``get_index_statstics`` parameters
-* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open.
-* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation
-* [#999] Fix documentation for ``AlignmentFile.get_reference_length()``
-* [#1002] Document the default min_base_quality for ``pileup()``
-
-
-Release 0.16.0
-==============
-
-This release wraps htslib/bcftools version 1.10.2 and samtools version
-1.10. The following bugs reported against pysam are fixed due to this:
-
-* [#447] Writing out QNAME longer than 251 characters corrupts BAM
-* [#640, #734, #843] Setting VariantRecord pos or stop raises error
-* [#738, #919] FastxFile truncates concatenated plain gzip compressed files
-
-Additional bugfixes:
-
-* [#840] Pileup doesn't work on python3 when `index_filename` is used
-* [#886] FastqProxy raises ValueError when instantiated from python
-* [#904] VariantFile.fetch() throws ValueError on files with no records
-* [#909] Fix incorrect quoting in VariantFile contig records
-* [#915, #916] Implement pileup() for unindexed files and/or SAM files
-
-Backwards incompatible changes:
-
-* The `samtools import` command was removed in samtools 1.10, so pysam
- no longer exports a `samimport` function. Use `pysam.view()` instead.
-
-
-Release 0.15.4
-==============
-
-Bugfix release. Principal reason for release is to update cython
-version in order to fix pip install pysam with python 3.8.
-
-* [#879] Fix add_meta function in libcbcf.pyx, so meta-information
- lines in header added with this function have double-quoting rules
- in accordance to rules specified in VCF4.2 and VCF4.3 specifications
-* [#863] Force arg to bytes to support non-ASCII encoding
-* [#875] Bump minimum Cython version
-* [#868] Prevent segfault on Python 2.7 AlignedSegment.compare(other=None)
-* [#867] Fix wheel building on TravisCI
-* [#863] Force arg to bytes to support non-ASCII encoding
-* [#799] disambiguate interpretation of bcf_read return code
-* [#841] Fix silent truncation of FASTQ with bad q strings
-* [#846] Prevent segmentation fault on ID, when handling malformed records
-* [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars
-
-
-Release 0.15.3
-==============
-
-Bugfix release.
-
-* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
-* [#780] close all filehandles before opening new ones in pysam_dispatch
-* [#773] do not cache VariantRecord.id to avoid memory leak
-* [#781] default of multiple_iterators=True is changed to False for
- CRAM files.
-* [#825] fix collections.abc import
-* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
- bug when setting FORMAT fields.
-* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
- memory as needed, rather than a fixed-size char buffer.
-* [#814] Build wheels and upload them to PyPI
-* [#755] Allow passing flags and arguments to index methods
-* [#763] Strip \0 in header check
-* [#761] Test Tabix index contents, not the compression
-
-Release 0.15.2
-==============
-
-Bugfix release.
-
-* [#746] catch pileup itorator out-of-scope segfaults
-* [#747] fix faixd fetch with region
-* [#748] increase max_pos to (1<<31)-1
-* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA
-* [#737] Fix bug in get_aligned_pairs, @bkohrn
-
-Release 0.15.1
-==============
-
-Bugfix release.
-
-* [#716] raise ValueError if tid is out of range when writing
-* [#697] release version using cython 0.28.5 for python 3.7
- compatibility
-
-Release 0.15.0
-==============
-
-This release wraps htslib/samtools/bcftools version 1.9.0.
-
-* [#673] permit dash in chromosome name of region string
-* [#656] Support `text` when opening a SAM file for writing
-* [#658] return None in get_forward_sequence if sequence not in record
-* [#683] allow lower case bases in MD tags
-* Ensure that = and X CIGAR ops are treated the same as M
-
-Release 0.14.1
-==============
-
-This is mostly a bugfix release, though bcftools has now also been
-upgraded to 1.7.0.
-
-* [#621] Add a warning to count_coverage when an alignment has an
- empty QUAL field
-* [#635] Speed-up of AlignedSegment.find_intro()
-* treat border case of all bases in pileup column below quality score
-* [#634] Fix access to pileup reference_sequence
-
-
-Release 0.14.0
-==============
-
-This release wraps htslib/samtools versions 1.7.0.
-
-* SAM/BAM/CRAM headers are now managed by a separate AlignmentHeader
- class.
-* AlignmentFile.header.as_dict() returns an ordered dictionary.
-* Use "stop" instead of "end" to ensure consistency to
- VariantFile. The end designations have been kept for backwards
- compatibility.
-
-* [#611] and [#293] CRAM repeated fetch now works, each iterator
- reloads index if multiple_iterators=True
-* [#608] pysam now wraps htslib 1.7 and samtools 1.7.
-* [#580] reference_name and next_reference_name can now be set to "*"
- (will be converted to None to indicate an unmapped location)
-* [#302] providing no coordinate to count_coverage will not count from
- start/end of contig.
-* [#325] @SQ records will be automatically added to header if they are
- absent from text section of header.
-* [#529] add get_forward_sequence() and get_forward_qualities()
- methods
-* [#577] add from_string() and to_dict()/from_dict() methods to
- AlignedSegment. Rename tostring() to to_string() throughout for
- consistency
-* [#589] return None from build_alignment_sequence if no MD tag is set
-* [#528] add PileupColumn.__len__ method
-
-Backwards incompatible changes:
-
-* AlignmentFile.header now returns an AlignmentHeader object. Use
- AlignmentFile.header.to_dict() to get the dictionary as
- previously. Most dictionary accessor methods (keys(), values(),
- __getitem__, ...) have been implemented to ensure some level of
- backwards compatibility when only reading.
-
- The rationale for this change is to have consistency between
- AlignmentFile and VariantFile.
-
-* AlignmentFile and FastaFile now raise IOError instead of OSError
-
-Medium term we plan to have a 1.0 release. The pysam
-interface has grown over the years and the API is cluttered with
-deprecated names (Samfile, getrname(), gettid(), ...). To work towards
-this, the next release (0.15.0) will yield DeprecationWarnings
-for any parts of the API that are considered obsolete and will not be
-in 1.0. Once 1.0 has been reached, we will use semantic versioning.
-
-Release 0.13.0
-===============
-
-This release wraps htslib/samtools/bcftools versions 1.6.0 and
-contains a series of bugfixes.
-
-* [#544] reading header from remote TabixFiles now works.
-* [#531] add missing tag types H and A. A python float will now be
- added as 'f' type instead of 'd' type.
-* [#543] use FastaFile instead of Fastafile in pileup.
-* [#546] set is_modified flag in setAttribute so updated attributes
- are output.
-* [#537] allow tabix index files to be created in a custom location.
-* [#530] add get_index_statistics() method
-
-
-Release 0.12.0.1
-================
-
-Bugfix release to solve compilation issue due to missinge
-bcftools/config.h file.
-
-Release 0.12.0
-==============
-
-This release wraps htslib/samtools/bcftools versions 1.5.0 and
-contains a series of bugfixes.
-
-* [#473] A new FastxRecord class that can be instantiated from class and
- modified in-place. Replaces PersistentFastqProxy.
-* [#521] In AligmentFile, Simplify file detection logic and allow remote index files
-
- * Removed attempts to guess data and index file names; this is magic left
- to htslib.
- * Removed file existence check prior to opening files with htslib
- * Better error checking after opening files that raise the appropriate
- error (IOError for when errno is set, ValueError otherwise for backward
- compatibility).
- * Report IO errors when loading an index by name.
- * Allow remote indices (tested using S3 signed URLs).
- * Document filepath_index and make it an alias for index_filename.
- * Added a require_index parameter to AlignmentFile
-
-* [#526] handle unset ref when creating new records
-* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
- segfaults
-* [#516] expose IO errors via IOError exceptions
-* [#487] add tabix line_skip, remove 'pileup' preset
-* add FastxRecord, replaces PersistentFastqProxy (still present for
- backwards compatibility)
-* [#496] upgrade to htslib/samtools/bcftools versions 1.5
-* add start/stop to AlignmentFile.fetch() to be consistent with
- VariantFile.fetch(). "end" is kept for backwards compatibility.
-* [#512] add get_index_statistics() method to AlignmentFile.
-
-Upcoming changes:
-
-In the next release we are plannig to separate the header information
-from AlignmentFile into a separate class AlignmentHeader. This layout
-is similar to VariantFile/VariantHeader. With this change we will
-ensure that an AlignedSegment record will be linked to a header so
-that chromosome names can be automatically translated from the numeric
-representation. As a consequence, the way new AlignedSegment records
-are created will need to change as the constructor requires a header::
-
- header = pysam.AlignmentHeader(
- reference_names=["chr1", "chr2"],
- reference_lengths=[1000, 1000])
-
- read = pysam.AlignedSegment(header)
-
-This will affect all code that instantiates AlignedSegment objects
-directly. We have not yet merged to allow users to provide feed-back.
-The pull-request is here: https://github.com/pysam-developers/pysam/pull/518
-Please comment on github.
-
-Release 0.11.2.2
-================
-
-Bugfix release to address two issues:
-
-* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and
- more tests have been added.
-* [#479] Correct VariantRecord edge cases described in issue
-
-Release 0.11.2.1
-================
-
-Release to fix release tar-ball containing 0.11.1 pre-compiled
-C-files.
-
-Release 0.11.2
-==============
-
-This release wraps htslib/samtools/bcfools versions 1.4.1 in response
-to a security fix in these libraries. Additionally the following
-issues have been fixed:
-
-* [#452] add GFF3 support for tabix parsers
-* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END
-* [#447] limit query name to 251 characters (only partially addresses issue)
-
-VariantFile and related object fixes
-
-* Restore VariantFile.\_\_dealloc\_\_
-* Correct handling of bcf_str_missing in bcf_array_to_object and
- bcf_object_to_array
-* Added update() and pop() methods to some dict-like proxy objects
-* scalar INFO entries could not be set again after being deleted
-* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without
- raising a KeyError
-* Multiple other fixes for VariantRecordInfo methods
-* INFO/END is now accessible only via VariantRecord.stop and
- VariantRecord.rlen. Even if present behind the scenes, it is no longer
- accessible via VariantRecordInfo.
-* Add argument to issue a warning instead of an exception if input appears
- to be truncated
-
-Other features and fixes:
-
-* Make AlignmentFile \_\_dealloc\_\_ and close more
- stringent
-* Add argument AlignmentFile to issue a warning instead of an
- exception if input appears to be truncated
-
-Release 0.11.1
-==============
-
-Bugfix release
-
-* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility.
-
-Release 0.11.0
-==============
-
-This release wraps the latest versions of htslib/samtools/bcftools and
-implements a few bugfixes.
-
-* [#413] Wrap HTSlib/Samtools/BCFtools 1.4
-* [#422] Fix missing pysam.sort.usage() message
-* [#411] Fix BGZfile initialization bug
-* [#412] Add seek support for BGZFile
-* [#395] Make BGZfile iterable
-* [#433] Correct getQueryEnd
-* [#419] Export SAM enums such as pysam.CMATCH
-* [#415] Fix access by tid in AlignmentFile.fetch()
-* [#405] Writing SAM now outputs a header by default.
-* [#332] split infer_query_length(always) into infer_query_length and infer_read_length
-
-Release 0.10.0
-==============
-
-This release implements further functionality in the VariantFile API
-and includes several bugfixes:
-
-* treat special case -c option in samtools view outputs to stdout even
- if -o given, fixes #315
-* permit reading BAM files with CSI index, closes #370
-* raise Error if query name exceeds maximum length, fixes #373
-* new method to compute hash value for AlignedSegment
-* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
-* Avoid segfault by detecting out of range reference_id and
- next_reference in AlignedSegment.tostring
-* Issue #355: Implement streams using file descriptors for VariantFile
-* upgrade to htslib 1.3.2
-* fix compilation with musl libc
-* Issue #316, #360: Rename all Cython modules to have lib as a prefix
-* Issue #332, hardclipped bases in cigar included by
- pysam.AlignedSegment.infer_query_length()
-* Added support for Python 3.6 filename encoding protocol
-* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord
-* Issue #331, fix failure in VariantFile.reset() method
-* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and
- VariantRecord.copy() methods to create new VariantRecord objects
-* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters
-* Preliminary (potentially unsafe) support for removing and altering header metadata
-* Many minor fixes and improvements to VariantFile and related objects
-
-Please note that all internal cython extensions now have a lib prefix
-to facilitate linking against pysam extension modules. Any user cython
-extensions using cimport to import pysam definitions will need
-changes, for example::
-
- cimport pysam.csamtools
-
-will become::
-
- cimport pysam.libcsamtools
-
-Release 0.9.1
-=============
-
-This is a bugfix release addressing some installation problems
-in pysam 0.9.0, in particular:
-
-* patch included htslib to work with older libcurl versions, fixes #262.
-* do not require cython for python 3 install, fixes #260
-* FastaFile does not accept filepath_index any more, see #270
-* add AlignedSegment.get_cigar_stats method.
-* py3 bugfix in VariantFile.subset_samples, fixes #272
-* add missing sysconfig import, fixes #278
-* do not redirect stdout, but instead write to a separately
- created file. This should resolve issues when pysam is used
- in notebooks or other environments that redirect stdout.
-* wrap htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1
-* use bgzf throughout instead of gzip
-* allow specifying a fasta reference for CRAM file when opening
- for both read and write, fixes #280
-
-Release 0.9.0
-=============
-
-Overview
---------
-
-The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
-enhancements and bugfixes. See below for a detailed list.
-
-`Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
-comes with additional capabilities for remote file access which depend
-on the presence of optional system libraries. As a consequence, the
-installation script :file:`setup.py` has become more complex. For an
-overview, see :ref:`installation`. We have tested installation on
-linux and OS X, but could not capture all variations. It is possible
-that a 0.9.1 release might follow soon addressing installation issues.
-
-The :py:class:`~.pysam.VariantFile` class provides access to
-:term:`vcf` and :term:`bcf` formatted files. The class is certainly
-usable and interface is reaching completion, but the API and the
-functionality is subject to change.
-
-Detailed release notes
-----------------------
-
-* upgrade to htslib 1.3
-* python 3 compatibility tested throughout.
-* added a first set of bcftools commands in the pysam.bcftools
- submodule.
-* samtools commands are now in the pysam.samtools module. For
- backwards compatibility they are still imported into the pysam
- namespace.
-* samtools/bcftools return stdout as a single (byte) string. As output
- can be binary (VCF.gz, BAM) this is necessary to ensure py2/py3
- compatibility. To replicate the previous behaviour in py2.7, use::
-
- pysam.samtools.view(self.filename).splitlines(True)
-
-* get_tags() returns the tag type as a character, not an integer (#214)
-* TabixFile now raises ValueError on indices created by tabix <1.0 (#206)
-* improve OSX installation and develop mode
-* FastxIterator now handles empty sequences (#204)
-* TabixFile.isremote is not TabixFile.is_remote in line with AlignmentFile
-* AlignmentFile.count() has extra optional argument read_callback
-* setup.py has been changed to:
- * install a single builtin htslib library. Previously, each pysam
- module contained its own version. This reduces compilation time
- and code bloat.
- * run configure for the builtin htslib library in order to detect
- optional libraries such as libcurl. Configure behaviour can be
- controlled by setting the environment variable
- HTSLIB_CONFIGURE_OPTIONS.
-* get_reference_sequence() now returns the reference sequence and not
- something looking like it. This bug had effects on
- get_aligned_pairs(with_seq=True), see #225. If you have relied on on
- get_aligned_pairs(with_seq=True) in pysam-0.8.4, please check your
- results.
-* improved autodetection of file formats in AlignmentFile and VariantFile.
-
-Release 0.8.4
-=============
-
-This release contains numerous bugfixes and a first implementation of
-a pythonic interface to VCF/BCF files. Note that this code is still
-incomplete and preliminary, but does offer a nearly complete immutable
-Pythonic interface to VCF/BCF metadata and data with reading and
-writing capability.
-
-Potential isses when upgrading from v0.8.3:
-
-* binary tags are now returned as python arrays
-
-* renamed several methods for pep8 compatibility, old names still retained for
- backwards compatibility, but should be considered deprecated.
-
- * gettid() is now get_tid()
- * getrname() is now get_reference_name()
- * parseRegion() is now parse_region()
-
-* some methods have changed for pep8 compatibility without the old
- names being present:
-
- * fromQualityString() is now qualitystring_to_array()
- * toQualityString() is now qualities_to_qualitystring()
-
-* faidx now returns strings and not binary strings in py3.
-
-* The cython components have been broken up into smaller files with
- more specific content. This will affect users using the cython
- interfaces.
-
-Edited list of commit log changes:
-
-* fixes AlignmentFile.check_index to return True
-* add RG/PM header tag - closes #179
-* add with_seq option to get_aligned_pairs
-* use char * inside reconsituteReferenceSequence
-* add soft clipping for get_reference_sequence
-* add get_reference_sequence
-* queryEnd now computes length from cigar string if no sequence present, closes #176
-* tolerate missing space at end of gtf files, closes #162
-* do not raise Error when receiving output on stderr
-* add docu about fetching without index, closes #170
-* FastaFile and FastxFile now return strings in python3, closes #173
-* py3 compat: relative -> absolute imports.
-* add reference_name and next_reference_name attributes to AlignedSegment
-* add function signatures to cvcf cython. Added note about other VCF code.
-* add context manager functions to FastaFile
-* add reference_name and next_reference_name attributes to AlignedSegment
-* PileupColumn also gets a reference_name attribute.
-* add context manager functions to FastaFile
-* TabixFile.header for remote files raises AttributeError, fixes #157
-* add context manager interface to TabixFile, closes #165
-* change ctypedef enum to typedef enum for cython 0.23
-* add function signatures to cvcf cython, also added note about other VCF code
-* remove exception for custom upper-case header record tags.
-* rename VALID_HEADER_FIELDS to KNOWN_HEADER_FIELDS
-* fix header record tag parsing for custom tags.
-* use cython.str in count_coverage, fixes #141
-* avoid maketrans (issues with python3)
-* refactoring: AlignedSegment now in separate module
-* do not execute remote tests if URL not available
-* fix the unmapped count, incl reads with no SQ group
-* add raw output to tags
-* added write access for binary tags
-* bugfix in call to resize
-* implemented writing of binary tags from arrays
-* implemented convert_binary_tag to use arrays
-* add special cases for reads that are unmapped or whose mates are unmapped.
-* rename TabProxies to ctabixproxies
-* remove underscores from utility functions
-* move utility methods into cutils
-* remove callback argument to fetch - closes #128
-* avoid calling close in dealloc
-* add unit tests for File object opening
-* change AlignmentFile.open to filepath_or_object
-* implement copy.copy, close #65
-* add chaching of array attributes in AlignedSegment, closes #121
-* add export of Fastafile
-* remove superfluous pysam_dispatch
-* use persist option in FastqFile
-* get_tag: expose tag type if requested with `with_value_type`
-* fix to allow reading vcf record info via tabix-based vcf reader
-* add pFastqProxy and pFastqFile objects to make it possible to work with multiple fastq records per file handle, unlike FastqProxy/FastqFile.
-* release GIL around htslib IO operations
-* More work on read/write support, API improvements
-* add `phased` property on `VariantRecordSample`
-* add mutable properties to VariantRecord
-* BCF fixes and start of read/write support
-* VariantHeaderRecord objects now act like mappings for attributes.
-* add VariantHeader.alts dict from alt ID->Record.
-* Bug fix to strong representation of structured header records.
-* VariantHeader is now mutable
-
-
-Release 0.8.3
-=============
-
-* samtools command now accept the "catch_stdout" option.
-
-* get_aligned_pairs now works for soft-clipped reads.
-
-* query_position is now None when a PileupRead is not aligned
- to a particular position.
-
-* AlignedSegments are now comparable and hashable.
-
-Release 0.8.2.1
-===============
-
-* Installation bugfix release.
-
-Release 0.8.2
-=============
-
-* Pysam now wraps htslib 1.2.1 and samtools version 1.2.
-
-* Added CRAM file support to pysam.
-
-* New alignment info interface.
- * opt() and setTag are deprecated, use get_tag() and set_tag()
- instead.
- * added has_tag()
- * tags is deprecated, use get_tags() and set_tags() instead.
-
-* FastqFile is now FastxFile to reflect that the latter permits
- iteration over both fastq- and fasta-formatted files.
-
-* A Cython wrapper for htslib VCF/BCF reader/writer. The wrapper
- provides a nearly complete Pythonic interface to VCF/BCF metadata
- with reading and writing capability. However, the interface is still
- incomplete and preliminary and lacks capability to mutate the
- resulting data.
-
-Release 0.8.1
-=============
-
-* Pysam now wraps htslib and samtools versions 1.1.
-
-* Bugfixes, most notable:
- * issue #43: uncompressed BAM output
- * issue #42: skip tests requiring network if none available
- * issue #19: multiple iterators can now be made to work on the same tabix file
- * issue #24: All strings returned from/passed to the pysam API are now unicode in python 3
- * issue #5: type guessing for lists of integers fixed
-
-* API changes for consistency. The old API is still present,
- but deprecated.
- In particular:
-
- * Tabixfile -> TabixFile
- * Fastafile -> FastaFile
- * Fastqfile -> FastqFile
- * Samfile -> AlignmentFile
- * AlignedRead -> AlignedSegment
- * qname -> query_name
- * tid -> reference_id
- * pos -> reference_start
- * mapq -> mapping_quality
- * rnext -> next_reference_id
- * pnext -> next_reference_start
- * cigar -> cigartuples
- * cigarstring -> cigarstring
- * tlen -> template_length
- * seq -> query_sequence
- * qual -> query_qualities, now returns array
- * qqual -> query_alignment_qualities, now returns array
- * tags -> tags
- * alen -> reference_length, reference is always "alignment", so removed
- * aend -> reference_end
- * rlen -> query_length
- * query -> query_alignment_sequence
- * qstart -> query_alignment_start
- * qend -> query_alignment_end
- * qlen -> query_alignment_length
- * mrnm -> next_reference_id
- * mpos -> next_reference_start
- * rname -> reference_id
- * isize -> template_length
- * blocks -> get_blocks()
- * aligned_pairs -> get_aligned_pairs()
- * inferred_length -> infer_query_length()
- * positions -> get_reference_positions()
- * overlap() -> get_overlap()
-
- * All strings are now passed to or received from the pysam API
- as strings, no more bytes.
-
-Other changes:
- * AlignmentFile.fetch(reopen) option is now multiple_iterators. The
- default changed to not reopen a file unless requested by the user.
- * FastaFile.getReferenceLength is now FastaFile.get_reference_length
-
-Backwards incompatible changes
-
-* Empty cigarstring now returns None (instead of '')
-* Empty cigar now returns None (instead of [])
-* When using the extension classes in cython modules, AlignedRead
- needs to be substituted with AlignedSegment.
-* fancy_str() has been removed
-* qual, qqual now return arrays
-
-Release 0.8.0
-=============
-
-* Disabled features
- * IteratorColumn.setMask() disabled as htslib does not implement
- this functionality?
-
-* Not implemented yet:
- * reading SAM files without header
-
-Tabix files between version 0.7.8 and 0.8.0 are
-not compatible and need to be re-indexed.
-
-While version 0.7.8 and 0.8.0 should be mostly
-compatible, there are some notable exceptions:
-
-* tabix iterators will fail if there are comments
- in the middle or the end of a file.
-
-* tabix raises always ValueError for invalid intervals.
- Previously, different types of errors were raised
- (KeyError, IndexError, ValueError) depending on
- the type of invalid intervals (missing chromosome,
- out-of-range, malformatted interval).
-
-
-Release 0.7.8
-=============
-
-* added AlignedRead.setTag method
-* added AlignedRead.blocks
-* unsetting CIGAR strings is now possible
-* empty CIGAR string returns empty list
-* added reopen flag to Samfile.fetch()
-* various bugfixes
-
-Release 0.7.7
-=============
-
-* added Fastafile.references, .nreferences and .lengths
-* tabix_iterator now uses kseq.h for python 2.7
-
-Release 0.7.6
-=============
-
-* added inferred_length property
-* issue 122: MACOSX getline missing, now it works?
-* seq and qual can be set None
-* added Fastqfile
-
-Release 0.7.5
-=============
-
-* switch to samtools 0.1.19
-* issue 122: MACOSX getline missing
-* issue 130: clean up tempfiles
-* various other bugfixes
-
-Release 0.7.4
-=============
-
-* further bugfixes to setup.py and package layout
-
-Release 0.7.3
-=============
-
-* further bugfixes to setup.py
-* upgraded distribute_setup.py to 0.6.34
-
-Release 0.7.2
-=============
-
-* bugfix in installer - failed when cython not present
-* changed installation locations of shared libraries
-
-Release 0.7.1
-=============
-
-* bugfix: missing PP tag PG records in header
-* added pre-built .c files to distribution
-
-Release 0.7
-===========
-
-* switch to tabix 0.2.6
-* added cigarstring field
-* python3 compatibility
-* added B tag handling
-* added check_sq and check_header options to Samfile.__init__
-* added lazy GTF parsing to tabix
-* reworked support for VCF format parsing
-* bugfixes
-
-Release 0.6
-===========
-
-* switch to samtools 0.1.18
-* various bugfixes
-* removed references to deprecated 'samtools pileup' functionality
-* AlignedRead.tags now returns an empty list if there are no tags.
-* added pnext, rnext and tlen
-
-Release 0.5
-===========
-
-* switch to samtools 0.1.16 and tabix 0.2.5
-* improved tabix parsing, added vcf support
-* re-organized code to permit linking against pysam
-* various bugfixes
-* added Samfile.positions and Samfile.overlap
-
-Release 0.4
-===========
-
-* switch to samtools 0.1.12a and tabix 0.2.3
-* added snp and indel calling.
-* switch from pyrex to cython
-* changed handling of samtools stderr
-* various bugfixes
-* added Samfile.count and Samfile.mate
-* deprecated AlignedRead.rname, added AlignedRead.tid
-
-Release 0.3
-===========
-
-* switch to samtools 0.1.8
-* added support for tabix files
-* numerous bugfixes including
-* permit simultaneous iterators on the same file
-* working access to remote files
+.. include:: ../NEWS
--- /dev/null
+sphinx==7.2.5
+sphinx-rtd-theme==1.3.0
[project]
name = "pysam"
-description = "pysam - a python module for reading, manipulating and writing genomic data sets."
+description = "Package for reading, manipulating, and writing genomic data"
license = { text = "MIT License" }
-version = "0.21.0"
authors = [
{ name = "Andreas Heger", email = "andreas.heger@gmail.com"}
]
dynamic = [
"classifiers",
"readme",
+ "version",
]
-dependencies = [
- "cython",
-]
-
+[project.urls]
+"Documentation" = "https://pysam.readthedocs.io/"
+"Release notes" = "https://pysam.readthedocs.io/en/stable/release.html"
[build-system]
-requires = ["setuptools>=59.0", "wheel", "Cython>=0.29.30,<3.0"]
+requires = ["setuptools>=59.0", "Cython>=0.29.12,<4"]
build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.cibuildwheel]
+before-all = "{project}/devtools/install-prerequisites.sh"
+# Necessary until we build libhts.a out-of-tree from within build_temp
+before-build = "make -C {project}/htslib distclean"
+
+test-requires = ["pytest"]
+test-command = "REF_PATH=: pytest {project}/tests"
+
+[tool.tox]
+legacy_tox_ini = """
+ [tox]
+ envlist = py36, py311
+
+ [testenv]
+ deps = pytest
+ setenv = REF_PATH=:
+ commands = pytest tests
+"""
elif op == BAM_CHARD_CLIP:
pass # advances neither
- cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
+ cdef char *md_tag, md_buffer[2];
+ cdef uint8_t md_typecode = md_tag_ptr[0]
+ if md_typecode == b'Z':
+ md_tag = bam_aux2Z(md_tag_ptr)
+ elif md_typecode == b'A':
+ # Work around HTSeq bug that writes 1-character strings as MD:A:v
+ md_buffer[0] = bam_aux2A(md_tag_ptr)
+ md_buffer[1] = b'\0'
+ md_tag = md_buffer
+ else:
+ raise TypeError('Tagged field MD:{}:<value> does not have expected type MD:Z'.format(chr(md_typecode)))
+
cdef int md_idx = 0
cdef char c
s_idx = 0
_sam = force_bytes(sam)
line.s = _sam
- sam_parse1(&line, dest.header.ptr, dest._delegate)
+ cdef int ret
+ ret = sam_parse1(&line, dest.header.ptr, dest._delegate)
+ if ret < 0:
+ raise ValueError("parsing SAM record string failed (error code {})".format(ret))
return dest
def get_reference_positions(self, full_length=False):
"""a list of reference positions that this read aligns to.
- By default, this method only returns positions in the
- reference that are within the alignment. If *full_length* is
- set, None values will be included for any soft-clipped or
- unaligned positions within the read. The returned list will
- thus be of the same length as the read.
+ By default, this method returns the (0-based) positions on the
+ reference that are within the read's alignment, leaving gaps
+ corresponding to deletions and other reference skips.
+ When *full_length* is True, the returned list is the same length
+ as the read and additionally includes None values corresponding
+ to insertions or soft-clipping, i.e., to bases of the read that
+ are not aligned to a reference position.
+ (See also :meth:`get_aligned_pairs` which additionally returns
+ the corresponding positions along the read.)
"""
cdef uint32_t k, i, l, pos
cdef int op
def get_aligned_pairs(self, matches_only=False, with_seq=False):
"""a list of aligned read (query) and reference positions.
+ Each item in the returned list is a tuple consisting of
+ the 0-based offset from the start of the read sequence
+ followed by the 0-based reference position.
+
For inserts, deletions, skipping either query or reference
position may be None.
----------
matches_only : bool
- If True, only matched bases are returned - no None on either
+ If True, only matched bases are returned --- no None on either
side.
with_seq : bool
If True, return a third element in the tuple containing the
def is_valid_tid(self, tid: int) -> bool: ...
def get_tid(self, reference: str) -> int: ...
+# The iterator produced by AlignmentFile is currently itself, but this may
+# change in future and code should not make assumptions about this type.
+AlignmentFileIterator = AlignmentFile
+
class AlignmentFile(HTSFile):
def __init__(
self,
@property
def nocoordinate(self) -> int: ...
def get_index_statistics(self) -> List[IndexStats]: ...
- def __iter__(self) -> Any: ...
- def __next__(self) -> Any: ...
+ def __iter__(self) -> AlignmentFileIterator: ...
+ def __next__(self) -> AlignedSegment: ...
def is_valid_tid(self, tid: int) -> bool: ...
def get_tid(self, reference: str) -> int: ...
def get_reference_name(self, tid: int) -> str: ...
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
-from pysam.libchtslib cimport HTSFile, hisremote
+from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
+ HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
from io import StringIO
if cfilename or cindexname:
with nogil:
- self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
+ self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
+ HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)
if not self.index and (cindexname or require_index):
if errno:
return bcf_format_get_alleles(self)
@alleles.setter
- def alleles(self, value: tuple):
+ def alleles(self, value):
# Sets the genotype, supply a tuple of alleles to set.
# The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
# The genotype is reset when an empty tuple, None or (None,) is supplied
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdio cimport SEEK_SET
from libc.stdlib cimport malloc, calloc, realloc, free
from cpython.object cimport PyObject
from pysam.libcutils cimport force_bytes, encode_filename
from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
- bgzf_tell, bgzf_getline, kstring_t, SEEK_SET, BGZF
+ bgzf_tell, bgzf_getline, kstring_t, BGZF
__all__ = ["BGZFile"]
# Write the data in the buffer to the file.
int bgzf_flush(BGZF *fp)
- int SEEK_SET
-
# Return a virtual file pointer to the current location in the file.
# No interpretation of the value should be made, other than a subsequent
# call to bgzf_seek can be used to position the file at the same point.
#
# @param fp BGZF file handler
# @param pos virtual file offset returned by bgzf_tell()
- # @param whence must be SEEK_SET
+ # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd)
# @return 0 on success and -1 on error
# /
int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
#
# @param fp BGZF file handler; must be opened for reading
# @param uoffset file offset in the uncompressed data
- # @param where SEEK_SET supported atm
+ # @param where SEEK_SET (cimported from libc.stdio) supported atm
#
# Returns 0 on success and -1 on error.
int bgzf_useek(BGZF *fp, long uoffset, int where)
# @return The index, or NULL if an error occurred.
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
+ #### Load a specific index file
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx The input index filename
+ # @param fmt One of the HTS_FMT_* index formats
+ # @param flags Flags to alter behaviour (see description)
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
+
+ int HTS_IDX_SAVE_REMOTE
+ int HTS_IDX_SILENT_FAIL
+
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
# @return The index, or NULL if an error occurred.
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
+ # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc data file filename
+ # @param fnidx Index filename, or NULL to search alongside @a fn
+ # @param flags Flags to alter behaviour
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
+
# Generate and save an index file
# @param fn Input BAM/etc filename, to which .csi/etc will be added
# @param min_shift Positive to generate CSI, or 0 to generate BAI
tbx_t * tbx_index_load(char *fn)
tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
+ tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags)
# free the array but not the values
char **tbx_seqnames(tbx_t *tbx, int *n)
#************************************************************************
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
+ hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
int bcf_index_build(const char *fn, int min_shift)
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
@property
def is_bcf(self) -> bool: ...
def reset(self) -> None: ...
- def seek(self, offset: int) -> int: ...
+ def seek(self, offset: int, whence: int = ...) -> int: ...
def tell(self) -> int: ...
def add_hts_options(self, format_options: Optional[List[str]] = ...) -> None: ...
def parse_region(
from cpython cimport PyBytes_FromStringAndSize
from pysam.libchtslib cimport *
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io
########################################################################
__all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
# maximum genomic coordinate
cdef int MAX_POS = (1 << 31) - 1
self.fp = NULL
if hclose(fp) != 0:
- raise IOError(herrno(self.fp), 'failed to close HFile', self.name)
+ raise IOError(errno, 'failed to close HFile', self.name)
def fileno(self):
if self.fp == NULL:
def readlines(self):
return list(self)
- def seek(self, Py_ssize_t offset, int whence=SEEK_SET):
+ def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET):
if self.fp == NULL:
raise IOError('operation on closed HFile')
- cdef Py_ssize_t off = hseek(self.fp, offset, whence)
+ cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence))
if off < 0:
raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
"""
return self.seek(self.start_offset)
- def seek(self, uint64_t offset):
+ def seek(self, uint64_t offset, int whence=io.SEEK_SET):
"""move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
if not self.is_open:
raise ValueError('I/O operation on closed file')
if self.is_stream:
raise IOError('seek not available in streams')
+ whence = libc_whence_from_io(whence)
+
cdef int64_t ret
if self.htsfile.format.compression == bgzf:
with nogil:
- ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+ ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
elif self.htsfile.format.compression == no_compression:
- ret = 0 if (hseek(self.htsfile.fp.hfile, offset, SEEK_SET) >= 0) else -1
+ ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
else:
raise NotImplementedError("seek not implemented in files compressed by method {}".format(
self.htsfile.format.compression))
cpdef parse_region(contig=*, start=*, stop=*, region=*, reference=*, end=*)
+cdef int libc_whence_from_io(int whence)
+
#########################################################################
# Utility functions for quality string conversions
from libc.stdio cimport fprintf, stderr, fflush
from libc.stdio cimport stdout as c_stdout
from posix.fcntl cimport open as c_open, O_WRONLY
+from posix.unistd cimport SEEK_SET, SEEK_CUR, SEEK_END
from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn
return contig, rstart, rstop
+cdef int libc_whence_from_io(int whence):
+    # Python's io module fixes SEEK_SET/SEEK_CUR/SEEK_END at 0/1/2, whereas
+    # the C/POSIX constants of the same names have unspecified values.
+    # Map the Python-side numbers onto the platform's own constants; testing
+    # against the literals 0/1/2 is equivalent to testing against io.SEEK_*.
+    if whence == 0:
+        return SEEK_SET
+    elif whence == 1:
+        return SEEK_CUR
+    elif whence == 2:
+        return SEEK_END
+    else:
+        # Most likely not a valid whence value; hand it through untouched
+        # so that HTSlib or the OS is the one to report the problem.
+        return whence
+
+
def _pysam_dispatch(collection,
method,
args=None,
// Version information used while compiling samtools, bcftools, and htslib
-#define SAMTOOLS_VERSION "1.17 (pysam)"
-#define BCFTOOLS_VERSION "1.17 (pysam)"
-#define HTS_VERSION_TEXT "1.17 (pysam)"
+#define SAMTOOLS_VERSION "1.18 (pysam)"
+#define BCFTOOLS_VERSION "1.18 (pysam)"
+#define HTS_VERSION_TEXT "1.18 (pysam)"
# pysam versioning information
-__version__ = "0.21.0"
+__version__ = "0.22.0"
-__samtools_version__ = "1.17"
-__bcftools_version__ = "1.17"
-__htslib_version__ = "1.17"
+__samtools_version__ = "1.18"
+__bcftools_version__ = "1.18"
+__htslib_version__ = "1.18"
--- /dev/null
+Cython>=0.29.12,<4
+++ /dev/null
-cython>=0.29.12
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
- cd .../samtools-1.17 # Within the unpacked release directory
+ cd .../samtools-1.18 # Within the unpacked release directory
./configure
make
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
- cd .../samtools-1.17 # Within the unpacked release directory
+ cd .../samtools-1.18 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
To build with plug-ins, you need to use the --enable-plugins configure option
as follows:
- cd .../samtools-1.17 # Within the unpacked release directory
+ cd .../samtools-1.18 # Within the unpacked release directory
./configure --enable-plugins --prefix=/path/to/location
make all all-htslib
make install install-htslib
the source distribution instead of installing the package. In that case
you can use:
- cd .../samtools-1.17 # Within the unpacked release directory
- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.17
+ cd .../samtools-1.18 # Within the unpacked release directory
+ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.18
make all all-htslib
It is possible to override the built-in search path using the HTS_PATH
/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
from the 5' end.
- Copyright (C) 2020-2022 Genome Research Ltd.
+ Copyright (C) 2020-2023 Genome Research Ltd.
Authors: Andrew Whitwham <aw7@sanger.ac.uk>
Rob Davies <rmd+git@sanger.ac.uk>
int oa_tag;
int del_tag;
int tol;
+ int unmap_len;
char *arg_list;
char *stats_file;
char *rejects_file;
long filtered = 0, written = 0, failed = 0;
kstring_t str = KS_INITIALIZE;
kstring_t oat = KS_INITIALIZE;
+ kstring_t seq = KS_INITIALIZE;
bed_entry_list_t *sites;
FILE *stats_fp = stderr;
khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
}
}
- if (param->fail_len >= 0 || param->filter_len >= 0) {
- hts_pos_t aql = active_query_len(b);
+ if (param->fail_len >= 0 || param->filter_len >= 0 || param->unmap_len >= 0) {
+ hts_pos_t aql = active_query_len(b);
- if (param->fail_len >= 0 && aql <= param->fail_len) {
- b->core.flag |= BAM_FQCFAIL;
- }
+ if (param->fail_len >= 0 && aql <= param->fail_len) {
+ b->core.flag |= BAM_FQCFAIL;
+ }
+
+ if (param->filter_len >= 0 && aql <= param->filter_len) {
+ filter = 1;
+ }
+
+ // Reads whose active query length is at or below --unmap-len are rebuilt
+ // as unmapped records: BAM_FUNMAP is set and the record is recreated with
+ // no CIGAR, keeping name, position, mate fields, sequence, quality and
+ // aux data.
+ if (param->unmap_len >= 0 && aql <= param->unmap_len) {
+
+ if (ks_resize(&seq, b->core.l_qseq) < 0) {
+ // Report the read name: bam_get_seq() is packed 4-bit data, not a
+ // NUL-terminated string, so it must never be passed to %s.
+ fprintf(stderr, "[ampliconclip] error: allocate memory for sequence %s\n", bam_get_qname(b));
+ goto fail;
+ }
+
+ ks_clear(&seq);
+ char *sb = ks_str(&seq);
+ uint8_t *sequence = bam_get_seq(b);
+ int i;
- if (param->filter_len >= 0 && aql <= param->filter_len) {
- filter = 1;
- }
+ // bam_set1() takes the sequence as text, so decode the packed bases.
+ for (i = 0; i < b->core.l_qseq ; ++i) {
+ *sb++ = seq_nt16_str[bam_seqi(sequence, i)];
+ }
+
+ // The final argument asks bam_set1() to reserve room for the aux data,
+ // which is copied across below.
+ if (bam_set1(b_tmp, b->core.l_qname - b->core.l_extranul - 1, bam_get_qname(b),
+ (b->core.flag | BAM_FUNMAP), b->core.tid, b->core.pos, 0,
+ 0, NULL, b->core.mtid, b->core.mpos, b->core.isize,
+ b->core.l_qseq, seq.s, (const char *)bam_get_qual(b),
+ bam_get_l_aux(b)) < 0) {
+ // As above, identify the read by its name rather than its packed bases.
+ fprintf(stderr, "[ampliconclip] error: could not unmap read %s\n", bam_get_qname(b));
+ goto fail;
+ }
+
+ // Copy the original aux block into the reserved space, account for it
+ // in l_data, then make the rebuilt record the current one.
+ memcpy(bam_get_aux(b_tmp), bam_get_aux(b), bam_get_l_aux(b));
+ b_tmp->l_data += bam_get_l_aux(b);
+ swap_bams(&b, &b_tmp);
+ }
}
if (b->core.flag & BAM_FQCFAIL) {
fail:
destroy_bed_hash(bed_hash);
ks_free(&oat);
+ ks_free(&seq);
sam_hdr_destroy(header);
bam_destroy1(b);
bam_destroy1(b_tmp);
fprintf(stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n");
fprintf(stderr, " --filter-len INT do not output reads INT size or shorter.\n");
fprintf(stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n");
+ fprintf(stderr, " --unmap-len INT unmap reads INT size or shorter, default 0.\n");
fprintf(stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n");
fprintf(stderr, " --rejects-file FILE file to write filtered reads.\n");
fprintf(stderr, " --original for clipped entries add an OA tag with original data.\n");
htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL, *reject = NULL;
clipping_type clipping = soft_clip;
- cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+ cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, 0, NULL, NULL, NULL};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"original", no_argument, NULL, 1013},
{"keep-tag", no_argument, NULL, 1014},
{"tolerance", required_argument, NULL, 1015},
+ {"unmap-len", required_argument, NULL, 1016},
{NULL, 0, NULL, 0}
};
case 1013: param.oa_tag = 1; break;
case 1014: param.del_tag = 0; break;
case 1015: param.tol = atoi(optarg); break;
+ case 1016: param.unmap_len = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage(); exit(1);
if (param.tol < 0) {
fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d,"
- " reseting tolerance to default of 5.\n", param.tol);
+ " resetting tolerance to default of 5.\n", param.tol);
param.tol = 5;
}
/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
from the 5' end.
- Copyright (C) 2020-2022 Genome Research Ltd.
+ Copyright (C) 2020-2023 Genome Research Ltd.
Authors: Andrew Whitwham <aw7@sanger.ac.uk>
Rob Davies <rmd+git@sanger.ac.uk>
int oa_tag;
int del_tag;
int tol;
+ int unmap_len;
char *arg_list;
char *stats_file;
char *rejects_file;
long filtered = 0, written = 0, failed = 0;
kstring_t str = KS_INITIALIZE;
kstring_t oat = KS_INITIALIZE;
+ kstring_t seq = KS_INITIALIZE;
bed_entry_list_t *sites;
FILE *stats_fp = samtools_stderr;
khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
}
}
- if (param->fail_len >= 0 || param->filter_len >= 0) {
- hts_pos_t aql = active_query_len(b);
+ if (param->fail_len >= 0 || param->filter_len >= 0 || param->unmap_len >= 0) {
+ hts_pos_t aql = active_query_len(b);
- if (param->fail_len >= 0 && aql <= param->fail_len) {
- b->core.flag |= BAM_FQCFAIL;
- }
+ if (param->fail_len >= 0 && aql <= param->fail_len) {
+ b->core.flag |= BAM_FQCFAIL;
+ }
+
+ if (param->filter_len >= 0 && aql <= param->filter_len) {
+ filter = 1;
+ }
+
+ // Reads whose active query length is at or below --unmap-len are rebuilt
+ // as unmapped records: BAM_FUNMAP is set and the record is recreated with
+ // no CIGAR, keeping name, position, mate fields, sequence, quality and
+ // aux data.
+ if (param->unmap_len >= 0 && aql <= param->unmap_len) {
+
+ if (ks_resize(&seq, b->core.l_qseq) < 0) {
+ // Report the read name: bam_get_seq() is packed 4-bit data, not a
+ // NUL-terminated string, so it must never be passed to %s.
+ fprintf(samtools_stderr, "[ampliconclip] error: allocate memory for sequence %s\n", bam_get_qname(b));
+ goto fail;
+ }
+
+ ks_clear(&seq);
+ char *sb = ks_str(&seq);
+ uint8_t *sequence = bam_get_seq(b);
+ int i;
- if (param->filter_len >= 0 && aql <= param->filter_len) {
- filter = 1;
- }
+ // bam_set1() takes the sequence as text, so decode the packed bases.
+ for (i = 0; i < b->core.l_qseq ; ++i) {
+ *sb++ = seq_nt16_str[bam_seqi(sequence, i)];
+ }
+
+ // The final argument asks bam_set1() to reserve room for the aux data,
+ // which is copied across below.
+ if (bam_set1(b_tmp, b->core.l_qname - b->core.l_extranul - 1, bam_get_qname(b),
+ (b->core.flag | BAM_FUNMAP), b->core.tid, b->core.pos, 0,
+ 0, NULL, b->core.mtid, b->core.mpos, b->core.isize,
+ b->core.l_qseq, seq.s, (const char *)bam_get_qual(b),
+ bam_get_l_aux(b)) < 0) {
+ // As above, identify the read by its name rather than its packed bases.
+ fprintf(samtools_stderr, "[ampliconclip] error: could not unmap read %s\n", bam_get_qname(b));
+ goto fail;
+ }
+
+ // Copy the original aux block into the reserved space, account for it
+ // in l_data, then make the rebuilt record the current one.
+ memcpy(bam_get_aux(b_tmp), bam_get_aux(b), bam_get_l_aux(b));
+ b_tmp->l_data += bam_get_l_aux(b);
+ swap_bams(&b, &b_tmp);
+ }
}
if (b->core.flag & BAM_FQCFAIL) {
fail:
destroy_bed_hash(bed_hash);
ks_free(&oat);
+ ks_free(&seq);
sam_hdr_destroy(header);
bam_destroy1(b);
bam_destroy1(b_tmp);
fprintf(samtools_stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n");
fprintf(samtools_stderr, " --filter-len INT do not output reads INT size or shorter.\n");
fprintf(samtools_stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n");
+ fprintf(samtools_stderr, " --unmap-len INT unmap reads INT size or shorter, default 0.\n");
fprintf(samtools_stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n");
fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n");
fprintf(samtools_stderr, " --original for clipped entries add an OA tag with original data.\n");
htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL, *reject = NULL;
clipping_type clipping = soft_clip;
- cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+ cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, 0, NULL, NULL, NULL};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"original", no_argument, NULL, 1013},
{"keep-tag", no_argument, NULL, 1014},
{"tolerance", required_argument, NULL, 1015},
+ {"unmap-len", required_argument, NULL, 1016},
{NULL, 0, NULL, 0}
};
case 1013: param.oa_tag = 1; break;
case 1014: param.del_tag = 0; break;
case 1015: param.tol = atoi(optarg); break;
+ case 1016: param.unmap_len = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage(); samtools_exit(1);
if (param.tol < 0) {
fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d,"
- " reseting tolerance to default of 5.\n", param.tol);
+ " resetting tolerance to default of 5.\n", param.tol);
param.tol = 5;
}
}
if (opts->all_bases) {
- if (tid != opts->last_tid && opts->last_tid >= 0) {
- hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
- if (opts->iter)
- len = MIN(opts->iter->end, len);
- if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos,
- len) < 0)
- return -1;
- if (tid >= 0) {
- if (empty_pileup2(opts, opts->h, tid,
- opts->iter ? opts->iter->beg : 0,
- pos-1) < 0)
+ if (tid != opts->last_tid && opts->last_tid >= -1) {
+ if (opts->last_tid >= 0) {
+ // remainder of previous ref
+ hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+ if (opts->iter)
+ len = MIN(opts->iter->end, len);
+ if (empty_pileup2(opts, opts->h, opts->last_tid,
+ opts->last_pos, len) < 0)
+ return -1;
+ }
+
+ opts->last_pos = opts->iter ? opts->iter->beg : 0;
+ }
+
+ // Any refs between last_tid and tid
+ if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) {
+ while (++opts->last_tid < tid) {
+ hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+ if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0)
return -1;
}
}
+
+ // Any gaps in this ref (same tid) or at start of this new tid
if (opts->last_pos >= 0 && pos > opts->last_pos+1) {
if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos,
pos-1) < 0)
return 0;
}
+ next_ref:
if (tid != opts->last_tid) {
if (opts->last_tid != -1) {
if (opts->all_bases) {
+ // Fill in remainder of previous reference
int i, N;
if (opts->iter) {
opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1);
}
seq->l = 0; qual->l = 0;
+
+ if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) {
+ opts->last_pos = 0;
+ goto next_ref;
+ }
+
opts->last_tid = tid;
-// if (opts->all_bases)
-// opts->last_pos = 0;
if (opts->iter)
opts->last_pos = opts->iter->beg;
else
if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0)
goto err;
}
+ // Without a region iterator and with all_bases > 1, also emit an empty
+ // pileup for every remaining reference up to n_targets.
+ while (!opts.iter && opts.all_bases > 1 &&
+ ++opts.last_tid < opts.h->n_targets) {
+ // sam_hdr_tid2len() returns hts_pos_t (64-bit); storing it in an int
+ // would truncate the length of very long references.
+ hts_pos_t len = sam_hdr_tid2len(opts.h, opts.last_tid);
+ if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0)
+ goto err;
+ }
+
} else {
if (pileup_loop(opts.fp, opts.h, readaln2,
opts.mode != MODE_SIMPLE ? nm_init : NULL,
opts.mode != MODE_SIMPLE ? nm_free : NULL,
&opts) < 0)
goto err;
+
+ next_ref_q:
if (opts.all_bases) {
// fill out terminator
int tid = opts.iter ? opts.iter->tid : opts.last_tid;
dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid),
opts.ks_ins_seq.s, opts.ks_ins_seq.l,
opts.ks_ins_qual.s, opts.ks_ins_qual.l);
+
+ if (!opts.iter && opts.all_bases > 1 &&
+ ++opts.last_tid < opts.h->n_targets) {
+ opts.last_pos = 0;
+ opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0;
+ goto next_ref_q;
+ }
// if (consensus_loop(&opts) < 0) {
// print_error_errno("consensus", "Failed");
// goto err;
}
if (opts->all_bases) {
- if (tid != opts->last_tid && opts->last_tid >= 0) {
- hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
- if (opts->iter)
- len = MIN(opts->iter->end, len);
- if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos,
- len) < 0)
- return -1;
- if (tid >= 0) {
- if (empty_pileup2(opts, opts->h, tid,
- opts->iter ? opts->iter->beg : 0,
- pos-1) < 0)
+ if (tid != opts->last_tid && opts->last_tid >= -1) {
+ if (opts->last_tid >= 0) {
+ // remainder of previous ref
+ hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+ if (opts->iter)
+ len = MIN(opts->iter->end, len);
+ if (empty_pileup2(opts, opts->h, opts->last_tid,
+ opts->last_pos, len) < 0)
+ return -1;
+ }
+
+ opts->last_pos = opts->iter ? opts->iter->beg : 0;
+ }
+
+ // Any refs between last_tid and tid
+ if (!opts->iter && tid > opts->last_tid && opts->all_bases > 1) {
+ while (++opts->last_tid < tid) {
+ hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid);
+ if (empty_pileup2(opts, opts->h, opts->last_tid, 0, len) < 0)
return -1;
}
}
+
+ // Any gaps in this ref (same tid) or at start of this new tid
if (opts->last_pos >= 0 && pos > opts->last_pos+1) {
if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos,
pos-1) < 0)
return 0;
}
+ next_ref:
if (tid != opts->last_tid) {
if (opts->last_tid != -1) {
if (opts->all_bases) {
+ // Fill in remainder of previous reference
int i, N;
if (opts->iter) {
opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1);
}
seq->l = 0; qual->l = 0;
+
+ if (!opts->iter && opts->all_bases > 1 && ++opts->last_tid < tid) {
+ opts->last_pos = 0;
+ goto next_ref;
+ }
+
opts->last_tid = tid;
-// if (opts->all_bases)
-// opts->last_pos = 0;
if (opts->iter)
opts->last_pos = opts->iter->beg;
else
if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0)
goto err;
}
+ // Without a region iterator and with all_bases > 1, also emit an empty
+ // pileup for every remaining reference up to n_targets.
+ while (!opts.iter && opts.all_bases > 1 &&
+ ++opts.last_tid < opts.h->n_targets) {
+ // sam_hdr_tid2len() returns hts_pos_t (64-bit); storing it in an int
+ // would truncate the length of very long references.
+ hts_pos_t len = sam_hdr_tid2len(opts.h, opts.last_tid);
+ if (empty_pileup2(&opts, opts.h, opts.last_tid, 0, len) < 0)
+ goto err;
+ }
+
} else {
if (pileup_loop(opts.fp, opts.h, readaln2,
opts.mode != MODE_SIMPLE ? nm_init : NULL,
opts.mode != MODE_SIMPLE ? nm_free : NULL,
&opts) < 0)
goto err;
+
+ next_ref_q:
if (opts.all_bases) {
// fill out terminator
int tid = opts.iter ? opts.iter->tid : opts.last_tid;
dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid),
opts.ks_ins_seq.s, opts.ks_ins_seq.l,
opts.ks_ins_qual.s, opts.ks_ins_qual.l);
+
+ if (!opts.iter && opts.all_bases > 1 &&
+ ++opts.last_tid < opts.h->n_targets) {
+ opts.last_pos = 0;
+ opts.ks_ins_seq.l = opts.ks_ins_qual.l = 0;
+ goto next_ref_q;
+ }
// if (consensus_loop(&opts) < 0) {
// print_error_errno("consensus", "Failed");
// goto err;
/* bam_fastq.c -- FASTA and FASTQ file generation
- Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
+ Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <assert.h>
#include <inttypes.h>
#include <unistd.h>
+#include <float.h>
#include "htslib/sam.h"
#include "htslib/klist.h"
" -o FILE write reads designated READ1 or READ2 to FILE\n"
" note: if a singleton file is specified with -s, only\n"
" paired reads will be written to the -1 and -2 files.\n"
-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
-" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
+" -d, --tag TAG[:VAL]\n"
+" only include reads containing TAG, optionally with value VAL\n"
+" -f, --require-flags INT\n"
+" only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F, --excl[ude]-flags INT\n"
+" only include reads with none of the FLAGs in INT present [0x900]\n" // F&x == 0
+" --rf, --incl[ude]-flags INT\n"
+" only include reads with any of the FLAGs in INT present [0]\n" // !(F&x == 0)
" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
" -n don't append /1 and /2 to the read name\n"
" -N always append /1 and /2 to the read name\n",
char *fnr[3];
char *fn_input; // pointer to input filename in argv do not free
bool has12, has12always, use_oq, copy_tags, illumina_tag;
- int flag_on, flag_off, flag_alloff;
+ int flag_on, flag_off, flag_alloff, flag_anyon;
sam_global_args ga;
fastfile filetype;
int def_qual;
char *index_format;
char *extra_tags;
char compression_level;
+ const char *filter_tag; // -d opt
+ const char *filter_value_str;
+ int64_t filter_value_int;
+ float filter_value_flt;
} bam2fq_opts_t;
typedef struct bam2fq_state {
samFile *hstdout;
sam_hdr_t *h;
bool has12, use_oq, copy_tags, illumina_tag;
- int flag_on, flag_off, flag_alloff;
+ int flag_on, flag_off, flag_alloff, flag_anyon;
fastfile filetype;
int def_qual;
char *index_sequence;
free(opts);
}
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c) ((c) + 128)
+
// return true if valid
static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
{
opts->extra_tags = NULL;
opts->compression_level = 1;
opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY;
- int flag_off_set = 0;
int c;
sam_global_args_init(&opts->ga);
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+ {"require-flags", required_argument, NULL, 'f'},
+ {"excl-flags", required_argument, NULL, 'F'},
+ {"exclude-flags", required_argument, NULL, 'F'},
+ // following the same convention as view: g exists as a longoption_only
+ // argument, accessible from the command line as --rf/--incl[ude]-flags
+ {"rf", required_argument, NULL, LONGOPT('g')},
+ {"incl-flags", required_argument, NULL, LONGOPT('g')},
+ {"include-flags", required_argument, NULL, LONGOPT('g')},
{"i1", required_argument, NULL, 1},
{"I1", required_argument, NULL, 1},
{"i2", required_argument, NULL, 2},
{"index-format", required_argument, NULL, 3},
{"barcode-tag", required_argument, NULL, 'b'},
{"quality-tag", required_argument, NULL, 'q'},
+ {"tag", required_argument, NULL, 'd'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:",
lopts, NULL)) > 0) {
switch (c) {
case 'b': opts->barcode_tag = optarg; break;
case '2': opts->fnr[2] = optarg; break;
case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break;
case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
- case 'F':
- if (!flag_off_set) {
- flag_off_set = 1;
- opts->flag_off = 0;
- }
- opts->flag_off |= strtol(optarg, 0, 0);
- break;
+ // note that flag_off does not have |= because it has a default
+ // value of 0x900 which needs to be replaced by the optarg
+ case 'F': opts->flag_off = strtol(optarg, 0, 0); break;
case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
+ case LONGOPT('g'): opts->flag_anyon |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
case 'N': opts->has12always = true; break;
case 'O': opts->use_oq = true; break;
case 'T': opts->extra_tags = optarg; break;
case 'v': opts->def_qual = atoi(optarg); break;
+ case 'd':
+ if (strlen(optarg) < 2 ||
+ (strlen(optarg) > 2 && optarg[2] != ':')) {
+ print_error("fastq",
+ "Invalid \"tag:value\" option: \"%s\"",
+ optarg);
+ free_opts(opts);
+ return false;
+ }
+
+ opts->filter_tag = optarg;
+ opts->filter_value_str = strlen(optarg) > 2 ? optarg+3 : NULL;
+ opts->filter_value_int = INT64_MAX; // fill out later
+ opts->filter_value_flt = FLT_MAX;
+ break;
+
case '?':
bam2fq_usage(stderr, argv[0]);
free_opts(opts);
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
state->flag_alloff = opts->flag_alloff;
+ state->flag_anyon = opts->flag_anyon;
state->has12 = opts->has12;
state->use_oq = opts->use_oq;
state->illumina_tag = opts->illumina_tag;
state->hstdout = NULL;
state->compression_level = opts->compression_level;
- state->fp = sam_open(opts->fn_input, "r");
+ state->fp = sam_open_format(opts->fn_input, "r", &opts->ga.in);
if (state->fp == NULL) {
print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
free(state);
}
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
- if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
+ if (opts->use_oq || opts->extra_tags || opts->index_file[0])
+ rf |= SAM_AUX;
+ if (opts->filter_tag) {
+ if (memcmp(opts->filter_tag, "NM", 2) == 0 ||
+ memcmp(opts->filter_tag, "MD", 2) == 0)
+ rf |= SAM_AUX | SAM_SEQ;
+ else if (memcmp(opts->filter_tag, "RG", 2) == 0)
+ rf |= SAM_RGAUX;
+ else
+ rf |= SAM_AUX;
+ }
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
free(state);
return valid;
}
-static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
+// Decide whether a read should be dropped from the output.
+// Returns true when the read fails the optional tag filter (opts->filter_tag,
+// optionally with a required value) or any of the flag filters held in state.
+// opts is not const because the parsed integer / float form of the filter
+// value is cached in it on first use.
+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state,
+ bam2fq_opts_t *opts)
{
+ if (opts->filter_tag) {
+ // A read that lacks the requested tag is always filtered out.
+ uint8_t *s = bam_aux_get(b, opts->filter_tag);
+ if (!s)
+ return true;
+
+ // With a value given, compare it according to the tag's aux type code.
+ if (opts->filter_value_str) {
+ switch (*s) {
+ case 'i': case 'I':
+ case 's': case 'S':
+ case 'c': case 'C':
+ // INT64_MAX is the "not parsed yet" sentinel.
+ if (opts->filter_value_int == INT64_MAX)
+ // cache integer conversion for repeated use
+ opts->filter_value_int =
+ strtoll(opts->filter_value_str, NULL, 0);
+ if (opts->filter_value_int != bam_aux2i(s))
+ return true;
+ break;
+
+ case 'f':
+ // FLT_MAX is the "not parsed yet" sentinel.
+ if (opts->filter_value_flt == FLT_MAX)
+ opts->filter_value_flt = atof(opts->filter_value_str);
+ // Comparing floats is hard.
+ // Eg (double)0.1 - (double)0.1f is -1.5e-9.
+ // Given BAM binary encoding is float however, just keep it.
+ // This means rounding errors will (hopefully) always be the
+ // same and basic equality still works.
+ if (opts->filter_value_flt != (float)bam_aux2f(s))
+ return true;
+ break;
+
+ case 'A':
+ // Only the first character of the requested value is compared.
+ if (s[1] != *opts->filter_value_str)
+ return true;
+ break;
+
+ case 'Z': case 'H':
+ // Exact, case-sensitive match on the text following the type byte.
+ if (strcmp((char *)s+1, opts->filter_value_str) != 0)
+ return true;
+ break;
+
+ default:
+ // Anything unsupported fails the filter match too.
+ return true;
+ }
+ }
+ }
+
+ // Flag filters: drop the read unless all flag_on bits are set, no flag_off
+ // bit is set, at least one flag_anyon bit is set (when flag_anyon is
+ // non-zero), and the flag_alloff bits are not all set (when non-zero).
return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
|| (b->core.flag&(state->flag_off)) != 0
+ || (((b->core.flag&(state->flag_anyon)) == 0) && (state->flag_anyon != 0))
|| (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
}
}
at_eof = res < 0;
- if (!at_eof && filter_it_out(b[n], state))
+ if (!at_eof && filter_it_out(b[n], state, opts))
continue;
if (!at_eof) {
++n_reads;
/* bam_fastq.c -- FASTA and FASTQ file generation
- Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
+ Copyright (C) 2009-2017, 2019-2020, 2023 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <assert.h>
#include <inttypes.h>
#include <unistd.h>
+#include <float.h>
#include "htslib/sam.h"
#include "htslib/klist.h"
" -o FILE write reads designated READ1 or READ2 to FILE\n"
" note: if a singleton file is specified with -s, only\n"
" paired reads will be written to the -1 and -2 files.\n"
-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
-" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
+" -d, --tag TAG[:VAL]\n"
+" only include reads containing TAG, optionally with value VAL\n"
+" -f, --require-flags INT\n"
+" only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F, --excl[ude]-flags INT\n"
+" only include reads with none of the FLAGs in INT present [0x900]\n" // F&x == 0
+" --rf, --incl[ude]-flags INT\n"
+" only include reads with any of the FLAGs in INT present [0]\n" // !(F&x == 0)
" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
" -n don't append /1 and /2 to the read name\n"
" -N always append /1 and /2 to the read name\n",
char *fnr[3];
char *fn_input; // pointer to input filename in argv do not free
bool has12, has12always, use_oq, copy_tags, illumina_tag;
- int flag_on, flag_off, flag_alloff;
+ int flag_on, flag_off, flag_alloff, flag_anyon;
sam_global_args ga;
fastfile filetype;
int def_qual;
char *index_format;
char *extra_tags;
char compression_level;
+ const char *filter_tag; // -d opt
+ const char *filter_value_str;
+ int64_t filter_value_int;
+ float filter_value_flt;
} bam2fq_opts_t;
typedef struct bam2fq_state {
samFile *hstdout;
sam_hdr_t *h;
bool has12, use_oq, copy_tags, illumina_tag;
- int flag_on, flag_off, flag_alloff;
+ int flag_on, flag_off, flag_alloff, flag_anyon;
fastfile filetype;
int def_qual;
char *index_sequence;
free(opts);
}
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c) ((c) + 128)
+
// return true if valid
static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
{
opts->extra_tags = NULL;
opts->compression_level = 1;
opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY;
- int flag_off_set = 0;
int c;
sam_global_args_init(&opts->ga);
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+ {"require-flags", required_argument, NULL, 'f'},
+ {"excl-flags", required_argument, NULL, 'F'},
+ {"exclude-flags", required_argument, NULL, 'F'},
+ // following the same convention as view: g exists as a longoption_only
+ // argument, accessible from the command line as --rf/--incl[ude]-flags
+ {"rf", required_argument, NULL, LONGOPT('g')},
+ {"incl-flags", required_argument, NULL, LONGOPT('g')},
+ {"include-flags", required_argument, NULL, LONGOPT('g')},
{"i1", required_argument, NULL, 1},
{"I1", required_argument, NULL, 1},
{"i2", required_argument, NULL, 2},
{"index-format", required_argument, NULL, 3},
{"barcode-tag", required_argument, NULL, 'b'},
{"quality-tag", required_argument, NULL, 'q'},
+ {"tag", required_argument, NULL, 'd'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:d:",
lopts, NULL)) > 0) {
switch (c) {
case 'b': opts->barcode_tag = optarg; break;
case '2': opts->fnr[2] = optarg; break;
case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break;
case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
- case 'F':
- if (!flag_off_set) {
- flag_off_set = 1;
- opts->flag_off = 0;
- }
- opts->flag_off |= strtol(optarg, 0, 0);
- break;
+ // note that flag_off does not have |= because it has a default
+ // value of 0x900 which needs to be replaced by the optarg
+ case 'F': opts->flag_off = strtol(optarg, 0, 0); break;
case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
+ case LONGOPT('g'): opts->flag_anyon |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
case 'N': opts->has12always = true; break;
case 'O': opts->use_oq = true; break;
case 'T': opts->extra_tags = optarg; break;
case 'v': opts->def_qual = atoi(optarg); break;
+ case 'd':
+ if (strlen(optarg) < 2 ||
+ (strlen(optarg) > 2 && optarg[2] != ':')) {
+ print_error("fastq",
+ "Invalid \"tag:value\" option: \"%s\"",
+ optarg);
+ free_opts(opts);
+ return false;
+ }
+
+ opts->filter_tag = optarg;
+ opts->filter_value_str = strlen(optarg) > 2 ? optarg+3 : NULL;
+ opts->filter_value_int = INT64_MAX; // fill out later
+ opts->filter_value_flt = FLT_MAX;
+ break;
+
case '?':
bam2fq_usage(samtools_stderr, argv[0]);
free_opts(opts);
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
state->flag_alloff = opts->flag_alloff;
+ state->flag_anyon = opts->flag_anyon;
state->has12 = opts->has12;
state->use_oq = opts->use_oq;
state->illumina_tag = opts->illumina_tag;
state->hstdout = NULL;
state->compression_level = opts->compression_level;
- state->fp = sam_open(opts->fn_input, "r");
+ state->fp = sam_open_format(opts->fn_input, "r", &opts->ga.in);
if (state->fp == NULL) {
print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
free(state);
}
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
- if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
+ if (opts->use_oq || opts->extra_tags || opts->index_file[0])
+ rf |= SAM_AUX;
+ if (opts->filter_tag) {
+ if (memcmp(opts->filter_tag, "NM", 2) == 0 ||
+ memcmp(opts->filter_tag, "MD", 2) == 0)
+ rf |= SAM_AUX | SAM_SEQ;
+ else if (memcmp(opts->filter_tag, "RG", 2) == 0)
+ rf |= SAM_RGAUX;
+ else
+ rf |= SAM_AUX;
+ }
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
free(state);
return valid;
}
-static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state, // returns true when the read must be skipped
+ bam2fq_opts_t *opts) // opts is written to: parsed -d values are cached in it
{
+ if (opts->filter_tag) {
+ uint8_t *s = bam_aux_get(b, opts->filter_tag); // look up the -d tag on this read
+ if (!s)
+ return true; // tag not found on the read: filter it out
+
+ if (opts->filter_value_str) { // a VAL was supplied, so the tag's value must match as well
+ switch (*s) { // dispatch on the aux field's type code
+ case 'i': case 'I':
+ case 's': case 'S':
+ case 'c': case 'C':
+ if (opts->filter_value_int == INT64_MAX)
+ // cache integer conversion for repeated use
+ opts->filter_value_int =
+ strtoll(opts->filter_value_str, NULL, 0);
+ if (opts->filter_value_int != bam_aux2i(s))
+ return true;
+ break;
+
+ case 'f':
+ if (opts->filter_value_flt == FLT_MAX) // FLT_MAX is the "not parsed yet" sentinel set by option parsing
+ opts->filter_value_flt = atof(opts->filter_value_str);
+ // Comparing floats is hard.
+ // Eg (double)0.1 - (double)0.1f is -1.5e-9.
+ // Given BAM binary encoding is float however, just keep it.
+ // This means rounding errors will (hopefully) always be the
+ // same and basic equality still works.
+ if (opts->filter_value_flt != (float)bam_aux2f(s))
+ return true;
+ break;
+
+ case 'A':
+ if (s[1] != *opts->filter_value_str) // only the first character of VAL is compared
+ return true;
+ break;
+
+ case 'Z': case 'H':
+ if (strcmp((char *)s+1, opts->filter_value_str) != 0) // s+1 skips the type byte
+ return true;
+ break;
+
+ default:
+ // Anything unsupported fails the filter match too.
+ return true;
+ }
+ }
+ }
+
return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
|| (b->core.flag&(state->flag_off)) != 0
+ || (((b->core.flag&(state->flag_anyon)) == 0) && (state->flag_anyon != 0)) // flag_anyon: drop reads with none of these bits set; a zero mask disables the test
|| (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
}
}
at_eof = res < 0;
- if (!at_eof && filter_it_out(b[n], state))
+ if (!at_eof && filter_it_out(b[n], state, opts))
continue;
if (!at_eof) {
++n_reads;
* samtools import a_1.fq a_2.fq
* samtools import a_interleaved.fq
*
- * Copyright (C) 2020-2021 Genome Research Ltd.
+ * Copyright (C) 2020-2021, 2023 Genome Research Ltd.
*
* Author: James Bonfield <jkb@sanger.ac.uk>
*/
char *rg;
char *rg_line;
char *order;
+ int order_str;
int compress_level;
htsThreadPool p;
int name2;
}
if (opts->order) {
- if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
- ret = -1;
- goto err;
+ if (opts->order_str) {
+ char buf[25];
+ snprintf(buf, sizeof(buf), "%0*"PRIu64,
+ opts->order_str, read_num++);
+ if (bam_aux_update_str(b, opts->order,
+ strlen(buf), buf) < 0) {
+ ret = -1;
+ goto err;
+ }
+ } else {
+ if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+ ret = -1;
+ goto err;
+ }
+ if (read_num == UINT_MAX)
+ fprintf(stderr, "Warning: --order tag has overflowed."
+ " Consider using TAG:LENGTH instead\n");
}
}
.rg = NULL,
.rg_line = NULL,
.order = NULL,
+ .order_str = 0,
.compress_level = -1,
.name2 = 0,
};
case 'N': opts.name2 = 1; break;
case 9: opts.no_pg = 1; break;
- case 3: opts.order = optarg; break;
+ case 3:
+ opts.order = optarg;
+ if (strlen(optarg) > 3 && optarg[2] == ':')
+ opts.order_str = atoi(optarg+3);
+ break;
case 'h': return usage(stdout, EXIT_SUCCESS);
case '?': return usage(stderr, EXIT_FAILURE);
* samtools import a_1.fq a_2.fq
* samtools import a_interleaved.fq
*
- * Copyright (C) 2020-2021 Genome Research Ltd.
+ * Copyright (C) 2020-2021, 2023 Genome Research Ltd.
*
* Author: James Bonfield <jkb@sanger.ac.uk>
*/
char *rg;
char *rg_line;
char *order;
+ int order_str;
int compress_level;
htsThreadPool p;
int name2;
}
if (opts->order) {
- if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
- ret = -1;
- goto err;
+ if (opts->order_str) {
+ char buf[25];
+ snprintf(buf, sizeof(buf), "%0*"PRIu64,
+ opts->order_str, read_num++);
+ if (bam_aux_update_str(b, opts->order,
+ strlen(buf), buf) < 0) {
+ ret = -1;
+ goto err;
+ }
+ } else {
+ if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+ ret = -1;
+ goto err;
+ }
+ if (read_num == UINT_MAX)
+ fprintf(samtools_stderr, "Warning: --order tag has overflowed."
+ " Consider using TAG:LENGTH instead\n");
}
}
.rg = NULL,
.rg_line = NULL,
.order = NULL,
+ .order_str = 0,
.compress_level = -1,
.name2 = 0,
};
case 'N': opts.name2 = 1; break;
case 9: opts.no_pg = 1; break;
- case 3: opts.order = optarg; break;
+ case 3:
+ opts.order = optarg;
+ if (strlen(optarg) > 3 && optarg[2] == ':')
+ opts.order_str = atoi(optarg+3);
+ break;
case 'h': return usage(samtools_stdout, EXIT_SUCCESS);
case '?': return usage(samtools_stderr, EXIT_FAILURE);
/* bam_index.c -- index and idxstats subcommands.
- Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023 Genome Research Ltd.
Portions copyright (C) 2010 Broad Institute.
Portions copyright (C) 2013 Peter Cock, The James Hutton Institute.
"Usage: samtools index -M [-bc] [-m INT] <in1.bam> <in2.bam>...\n"
" or: samtools index [-bc] [-m INT] <in.bam> [out.index]\n"
"Options:\n"
-" -b Generate BAI-format index for BAM files [default]\n"
-" -c Generate CSI-format index for BAM files\n"
-" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
-" -M Interpret all filename arguments as files to be indexed\n"
-" -o FILE Write index to FILE [alternative to <out.index> as an argument]\n"
-" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
+" -b, --bai Generate BAI-format index for BAM files [default]\n"
+" -c, --csi Generate CSI-format index for BAM files\n"
+" -m, --min-shift INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -M Interpret all filename arguments as files to be indexed\n"
+" -o, --output FILE Write index to FILE [alternative to <out.index> in args]\n"
+" -@, --threads INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
// Returns 1 if the file does not exist or can be positively
int n_files, c, i, ret;
const char *fn_idx = NULL;
- while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0)
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'),
+ {"output", required_argument, NULL, 'o'},
+ {"bai", no_argument, NULL, 'b'},
+ {"csi", no_argument, NULL, 'c'},
+ {"min-shift", required_argument, NULL, 'm'},
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "bcm:Mo:@:", lopts, NULL)) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
/* bam_index.c -- index and idxstats subcommands.
- Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2011, 2013-2016, 2018, 2019, 2023 Genome Research Ltd.
Portions copyright (C) 2010 Broad Institute.
Portions copyright (C) 2013 Peter Cock, The James Hutton Institute.
"Usage: samtools index -M [-bc] [-m INT] <in1.bam> <in2.bam>...\n"
" or: samtools index [-bc] [-m INT] <in.bam> [out.index]\n"
"Options:\n"
-" -b Generate BAI-format index for BAM files [default]\n"
-" -c Generate CSI-format index for BAM files\n"
-" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
-" -M Interpret all filename arguments as files to be indexed\n"
-" -o FILE Write index to FILE [alternative to <out.index> as an argument]\n"
-" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
+" -b, --bai Generate BAI-format index for BAM files [default]\n"
+" -c, --csi Generate CSI-format index for BAM files\n"
+" -m, --min-shift INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -M Interpret all filename arguments as files to be indexed\n"
+" -o, --output FILE Write index to FILE [alternative to <out.index> in args]\n"
+" -@, --threads INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
// Returns 1 if the file does not exist or can be positively
int n_files, c, i, ret;
const char *fn_idx = NULL;
- while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0)
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'),
+ {"output", required_argument, NULL, 'o'},
+ {"bai", no_argument, NULL, 'b'},
+ {"csi", no_argument, NULL, 'c'},
+ {"min-shift", required_argument, NULL, 'm'},
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "bcm:Mo:@:", lopts, NULL)) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
regex_t *bc_rgx;
int read_groups;
int json;
+ int dc;
} md_param_t;
typedef struct {
bam1_t *b;
struct read_queue_s *duplicate;
struct read_queue_s *original;
+ int dc;
hts_pos_t pos;
int dup_checked;
int read_group;
in_read->original = NULL;
in_read->dup_checked = 0;
in_read->read_group = 0;
+ in_read->dc = 1;
if (param->read_groups) {
uint8_t *data;
}
bp->p = in_read;
+ bp->p->dc += 1;
if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
if (new_score + tie_add > old_score) { // swap reads
dup = bp->p->b;
+ in_read->dc += bp->p->dc;
if (param->check_chain) {
}
dup = in_read->b;
+ bp->p->dc += 1;
}
if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings))
in_read->original = bp->p;
}
+ bp->p->dc += 1;
+
if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
// to the single hash and mark the other as duplicate
if (new_score > old_score) { // swap reads
dup = bp->p->b;
+ in_read->dc += bp->p->dc;
if (param->check_chain) {
in_read->duplicate = bp->p;
in_read->original = bp->p;
}
+ bp->p->dc += 1;
dup = in_read->b;
}
}
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+ bam_aux_update_int(in_read->b, "dc", in_read->dc);
+ }
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
print_error("markdup", "error, writing temp output failed.\n");
}
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+ bam_aux_update_int(in_read->b, "dc", in_read->dc);
+ }
+
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
print_error("markdup", "error, writing temp output failed on final write.\n");
goto fail;
}
} else {
+ if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+ bam_aux_update_int(in_read->b, "dc", in_read->dc);
+ }
+
if (sam_write1(param->out, header, in_read->b) < 0) {
print_error("markdup", "error, writing output failed on final write.\n");
goto fail;
}
if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
+ if (param->dc && (b->core.flag & BAM_FDUP)) {
+ uint8_t* data = bam_aux_get(b, "dc");
+ if(data) bam_aux_del(b, data);
+ }
if (sam_write1(param->out, header, b) < 0) {
print_error("markdup", "error, writing final output failed.\n");
goto fail;
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(idx_fn);
free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(idx_fn);
free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
fprintf(stderr, " --use-read-groups Use the read group tags in duplicate matching.\n");
fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
+ fprintf(stderr, " --duplicate-count Record the original primary read duplication count(include itself) in a \'dc\' tag.\n");
sam_global_opt_help(stderr, "-.O..@..");
char *regex = NULL, *bc_regex = NULL;
char *regex_order = "txy";
md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0};
+ 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0, 0};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"barcode-rgx", required_argument, NULL, 1008},
{"use-read-groups", no_argument, NULL, 1009},
{"json", no_argument, NULL, 1010},
+ {"duplicate-count", no_argument, NULL, 1011},
{NULL, 0, NULL, 0}
};
case 1008: bc_name = 1, bc_regex = optarg; break;
case 1009: param.read_groups = 1; break;
case 1010: param.json = 1; param.do_stats = 1; break;
+ case 1011: param.dc = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
regex_t *bc_rgx;
int read_groups;
int json;
+ int dc;
} md_param_t;
typedef struct {
bam1_t *b;
struct read_queue_s *duplicate;
struct read_queue_s *original;
+ int dc;
hts_pos_t pos;
int dup_checked;
int read_group;
in_read->original = NULL;
in_read->dup_checked = 0;
in_read->read_group = 0;
+ in_read->dc = 1;
if (param->read_groups) {
uint8_t *data;
}
bp->p = in_read;
+ bp->p->dc += 1;
if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
if (new_score + tie_add > old_score) { // swap reads
dup = bp->p->b;
+ in_read->dc += bp->p->dc;
if (param->check_chain) {
}
dup = in_read->b;
+ bp->p->dc += 1;
}
if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings))
in_read->original = bp->p;
}
+ bp->p->dc += 1;
+
if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
// to the single hash and mark the other as duplicate
if (new_score > old_score) { // swap reads
dup = bp->p->b;
+ in_read->dc += bp->p->dc;
if (param->check_chain) {
in_read->duplicate = bp->p;
in_read->original = bp->p;
}
+ bp->p->dc += 1;
dup = in_read->b;
}
}
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+ bam_aux_update_int(in_read->b, "dc", in_read->dc);
+ }
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
print_error("markdup", "error, writing temp output failed.\n");
}
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
+ if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+ bam_aux_update_int(in_read->b, "dc", in_read->dc);
+ }
+
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
print_error("markdup", "error, writing temp output failed on final write.\n");
goto fail;
}
} else {
+ if (param->dc && !(in_read->b->core.flag & BAM_FDUP)) {
+ bam_aux_update_int(in_read->b, "dc", in_read->dc);
+ }
+
if (sam_write1(param->out, header, in_read->b) < 0) {
print_error("markdup", "error, writing output failed on final write.\n");
goto fail;
}
if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
+ if (param->dc && (b->core.flag & BAM_FDUP)) {
+ uint8_t* data = bam_aux_get(b, "dc");
+ if(data) bam_aux_del(b, data);
+ }
if (sam_write1(param->out, header, b) < 0) {
print_error("markdup", "error, writing final output failed.\n");
goto fail;
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(idx_fn);
free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(idx_fn);
free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
fprintf(samtools_stderr, " --use-read-groups Use the read group tags in duplicate matching.\n");
fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
+ fprintf(samtools_stderr, " --duplicate-count Record the original primary read duplication count(include itself) in a \'dc\' tag.\n");
sam_global_opt_help(samtools_stderr, "-.O..@..");
char *regex = NULL, *bc_regex = NULL;
char *regex_order = "txy";
md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0};
+ 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0, 0};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"barcode-rgx", required_argument, NULL, 1008},
{"use-read-groups", no_argument, NULL, 1009},
{"json", no_argument, NULL, 1010},
+ {"duplicate-count", no_argument, NULL, 1011},
{NULL, 0, NULL, 0}
};
case 1008: bc_name = 1, bc_regex = optarg; break;
case 1009: param.read_groups = 1; break;
case 1010: param.json = 1; param.do_stats = 1; break;
+ case 1011: param.dc = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
header = sam_hdr_read(fp);
if (header == NULL || sam_hdr_nref(header) == 0) {
- fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
- goto fail;
+ // NB: if we have no SQ headers but have aligned data, then this will
+ // be caught during processing with e.g.
+ // "[E::sam_parse1] no SQ lines present in the header"
+ fprintf(stderr, "[bam_fillmd] warning: input SAM does not have "
+ "header, performing a no-op.\n");
}
fpout = sam_open_format("-", mode_w, &ga.out);
header = sam_hdr_read(fp);
if (header == NULL || sam_hdr_nref(header) == 0) {
- fprintf(samtools_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
- goto fail;
+ // NB: if we have no SQ headers but have aligned data, then this will
+ // be caught during processing with e.g.
+ // "[E::sam_parse1] no SQ lines present in the header"
+ fprintf(samtools_stderr, "[bam_fillmd] warning: input SAM does not have "
+ "header, performing a no-op.\n");
}
fpout = sam_open_format(samtools_stdout_fn, mode_w, &ga.out);
if (!h)
return ret;
+ // Match output version number with input file.
+ char vers[99];
+ sprintf(vers, "%d.%d", cram_major_vers(in), cram_minor_vers(in));
+ cram_set_option(out, CRAM_OPT_VERSION, vers);
+
// Attempt to fill out a cram->refs[] array from @SQ headers
sam_hdr_t *cram_h = sam_hdr_dup(h);
if (!cram_h)
if (!h)
return ret;
+ // Match output version number with input file.
+ char vers[99];
+ sprintf(vers, "%d.%d", cram_major_vers(in), cram_minor_vers(in));
+ cram_set_option(out, CRAM_OPT_VERSION, vers);
+
// Attempt to fill out a cram->refs[] array from @SQ headers
sam_hdr_t *cram_h = sam_hdr_dup(h);
if (!cram_h)
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2022 Genome Research Ltd.
+ Copyright (C) 2008-2023 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "bedidx.h"
#include "bam.h"
+//#define DEBUG_MINHASH
+
#define BAM_BLOCK_SIZE 2*1024*1024
#define MAX_TMP_FILES 64
htsThreadPool *htspool,
const char *cmd, const htsFormat *in_fmt,
const htsFormat *out_fmt, char *arg_list, int no_pg,
- int write_index) {
+ int write_index, int final_out) {
samFile *fpout = NULL, **fp = NULL;
heap1_t *heap = NULL;
uint64_t idx = 0;
ks_heapmake(heap, heap_size, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->entry.bam_record;
- if (g_sam_order == MinHash && b->core.tid == -1) {
+ if (g_sam_order == MinHash && b->core.tid == -1 && final_out) {
// Remove the cached minhash value
b->core.pos = -1;
b->core.mpos = -1;
//
// The 64-bit sort key is split over the bam pos and isize fields.
// This permits it to survive writing to temporary file and coming back.
+
+#ifdef DEBUG_MINHASH
+static int ntot = 0, nmis = 0, ndup = 0;
+#endif
+
static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
{
const bam1_t *A = a.bam_record;
if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b);
- const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
- const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
+ const uint64_t m_a = (((uint64_t)A->core.pos)<<31)|(uint32_t)A->core.mpos;
+ const uint64_t m_b = (((uint64_t)B->core.pos)<<31)|(uint32_t)B->core.mpos;
if (m_a < m_b) // by hash
return -1;
else if (m_a > m_b)
return 1;
- else if (A->core.isize < B->core.isize) // by hash location in seq
+
+ // Bigger pos with size minhash means starts further to left
+ else if (A->core.isize > B->core.isize) // by hash location in seq
return -1;
- else if (A->core.isize > B->core.isize)
+ else if (A->core.isize < B->core.isize)
return 1;
else
return bam1_cmp_core(a,b);
int error;
int large_pos;
int minimiser_kmer;
+ bool try_rev;
+ bool no_squash;
} worker_t;
// Returns 0 for success
for (i = 0; i < l; ++i) {
bam1_t *b = buf[i].bam_record;
if (clear_minhash && b->core.tid == -1) {
+ // To see the position for debugging
+ // b->core.pos = ((((uint64_t)b->core.pos)<<31)|(uint32_t)b->core.mpos) + b->core.isize;
// Remove the cached minhash value
b->core.pos = -1;
b->core.mpos = -1;
return ret;
}
+KHASH_MAP_INIT_INT64(kmer, int64_t)
+static khash_t(kmer) *kmer_h = NULL;
+
+// Punt homopolymers somewhere central in the hash space
+#define XOR 0xdead7878beef7878
+
/*
- * Computes the minhash of a sequence using both forward and reverse strands.
+ * Computes the minhash of a sequence using forward strand and if requested
+ * reverse strand.
*
* This is used as a sort key for unmapped data, to collate like sequences
* together and to improve compression ratio.
* The minhash is returned and *pos filled out with location of this hash
* key in the sequence if pos != NULL.
*/
-static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+/*
+ * Scans one window of b's sequence and returns the smallest k-mer hash.
+ *
+ * b          record whose sequence (bam_get_seq / core.l_qseq) is hashed
+ * kmer       k-mer length; each base contributes 2 bits to the hash
+ * window     number of bases to scan, starting at *curr_pos and clamped
+ *            to the sequence length
+ * curr_pos   in:  first base of the window (must not be NULL)
+ *            out: recorded position of the minimum minus (kmer-1); if no
+ *                 k-mer was hashed this is the incoming value minus (kmer-1)
+ * end        if non-NULL, set to 1 when the window reached the end of the
+ *            sequence, otherwise 0
+ * is_rev     if non-NULL, set to 1 when the reverse-strand minimum was
+ *            chosen, otherwise 0
+ * try_fwd    non-zero to hash the forward strand
+ * try_rev    non-zero to hash the reverse strand
+ * no_squash  zero to collapse runs of identical bases into one base before
+ *            hashing; non-zero to hash every base
+ *
+ * Returns the minimum hash found, or UINT64_MAX if no k-mer was completed
+ * within the window.
+ */
+static uint64_t minhash(bam1_t *b, int kmer, int window, int *curr_pos,
+                        int *end, int *is_rev, int try_fwd, int try_rev,
+                        int no_squash) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX;
+    int minhashpf = *curr_pos, i, j;
+    // Build the mask in 64-bit unsigned arithmetic.  "1L" is only 32 bits
+    // where long is 32 bits, and a shift by the full width (kmer == 32)
+    // is undefined, so that case is handled explicitly.
+    uint64_t mask = kmer >= 32 ? UINT64_MAX : (1ULL<<(2*kmer))-1;
+    uint8_t *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    uint64_t xor = XOR & mask;
+
+    if (is_rev) *is_rev = 0;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+    // Every code other than A, C, G and T maps to X (0).
+#define X 0
+    static unsigned char L[16] = {
+        X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X,
+    };
+    // Reverse-strand bases enter at the top of the k-mer and shift down.
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    int i_start = *curr_pos;
+    int i_end = MIN(i_start + window, len);
+    int last_base = -1;
+
+    if (try_fwd) {
+        // Initialise hash keys
+        for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) {
+            int base = bam_seqi(seq, i);
+            // collapse homopolymers
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashf = (hashf<<2) | L[base];
+                j++;
+            }
+        }
+
+        // Loop to find minimum
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashf = (hashf<<2) | L[base];
+                uint64_t hashfx = (hashf ^ XOR) & mask;
+                if (minhashf > hashfx)
+                    minhashf = hashfx, minhashpf = i;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashf = (hashf<<2) | L[base];
+                    uint64_t hashfx = (hashf ^ XOR) & mask;
+                    if (minhashf > hashfx)
+                        minhashf = hashfx, minhashpf = i;
+                }
+            }
+        }
+    }
+
+    // Same as above for the reverse strand.
+    // Not used for now, but we may wish to consider indexing in both
+    // strands, recording the strand in value (pos), and comparing in one
+    // strand only.  Right now we compare on both against a single-stranded
+    // index.
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX;
+        int minhashpr = *curr_pos;
+        int last_base = -1;
+
+        for (i = i_start, j = 0; j < kmer-1 && i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashr = (hashr>>2) | R[base];
+                j++;
+            }
+        }
+
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashr = (hashr>>2) | R[base];
+                if (minhashr > (hashr^xor))
+                    minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashr = (hashr>>2) | R[base];
+                    if (minhashr > (hashr^xor))
+                        minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+                }
+            }
+        }
+
+        // Keep whichever strand produced the smaller hash.
+        if (minhashr < minhashf) {
+            minhashf = minhashr;
+            minhashpf = minhashpr;
+            if (is_rev) *is_rev = 1;
+        }
+    }
+
+    // "*curr_pos = minhashpf" is faster here, but is sometimes
+    // poorer in compression.  Eg 10 million novaseq records with
+    // 75.1MB vs 76.9MB cram BA field.
+    //*curr_pos = minhashpf;
+    *curr_pos = minhashpf - (kmer-1);
+    if (end) *end = (i_end == len);
+    return minhashf;
+}
+
+#define UNIQ_BIT 60
+#define UNIQ_TEST(x) (((x) & (1ULL<<UNIQ_BIT))==0)
+#define UNIQ_MASK ((1ULL<<UNIQ_BIT)-1)
+/*
+ * Builds the file-scope k-mer index (kmer_h) from the records in file fn.
+ *
+ * Each record at least "window" bases long is scanned on the forward strand
+ * with minhash(); every minimiser found is stored as a key whose value is
+ * its position in the concatenation of all indexed sequences.  Bit UNIQ_BIT
+ * of the value is set when the key had already been seen.  Shorter records
+ * are skipped and do not advance the running position.
+ *
+ * kmer, window and no_squash are passed through to minhash().
+ *
+ * Returns 0 on success, 1 on failure.  kmer_h is not freed here.
+ */
+static int build_minhash_index(char *fn, int kmer, int window, int no_squash) {
+    int ret = 1;
+    samFile *in;
+    sam_hdr_t *h = NULL;
+    bam1_t *b = NULL;
+
+    in = sam_open(fn, "r");
+    if (!in) {
+        perror(fn);
+        return 1;
+    }
+
+    kmer_h = kh_init(kmer);
+    if (!kmer_h)
+        goto err;
+
+    if (!(h = sam_hdr_read(in)))
+        goto err;
+
+    if (!(b = bam_init1()))
+        goto err;
+
+    int r;
+    uint64_t tpos = 0;
+    while ((r = sam_read1(in, h, b)) >= 0) {
+        //fprintf(stderr, "LEN\t%d\t%s\n", b->core.l_qseq, bam_get_qname(b));
+        uint64_t hashf;
+        int pos = 0, end = 0;
+        khiter_t k;
+        int put_ret; // kh_put status; kept distinct from the return code
+
+        if (b->core.l_qseq < window)
+            continue;
+
+        // fwd
+        while (!end) {
+            int last_pos = pos;
+            hashf = minhash(b, kmer, window, &pos, &end, NULL, 1, 0,
+                            no_squash);
+            k = kh_put(kmer, kmer_h, hashf, &put_ret);
+            // A negative status means the insert failed and k is not a
+            // valid slot, so it must not be written through.
+            if (put_ret < 0)
+                goto err;
+            // put_ret == 0: key already present, so flag it as non-unique
+            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!put_ret)<<UNIQ_BIT);
+            // Always move forward, by at least one k-mer from the last start
+            pos = MAX(last_pos+kmer, pos+1);
+            //pos++; Slower, but indexes a bit better?
+        }
+        tpos += b->core.l_qseq;
+
+// We could also add reverse keys to the index here.
+// This would avoid reverse complementing during the matching stage.
+// We'd need to add a flag (another high bit of kh_value) to indicate
+// strand.
+// I'm unsure if this is a good trade-off or not.
+
+//        // rev
+//        pos = 0; end = 0;
+//        while (!end) {
+//            hashf = minhash(b, kmer, window, &pos, &end, NULL, 0, 1,
+//                            no_squash);
+//            k = kh_put(kmer, kmer_h, hashf, &ret);
+//            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<<UNIQ_BIT);
+//            pos++;
+//        }
+//
+//        tpos += b->core.l_qseq;
+    }
+    // sam_read1 result below -1 is treated as a read error
+    if (r < -1)
+        goto err;
+
+    ret = 0;
+ err:
+    if (b) bam_destroy1(b);
+    if (h) sam_hdr_destroy(h);
+    sam_close(in);
+
+    return ret;
+}
+
+/*
+ * A variant of minhash that compares against a previously built index.
+ *
+ * We follow the same steps of scanning through this sequence to find the
+ * minimum hash, but we prefer hash keys that have unique placement in the
+ * index, or if not unique, then non-uniquely placed, over ones that
+ * are absent from the index.
+ */
+static uint64_t minhash_with_idx(bam1_t *b, int kmer, int *pos, int *rev,
+ bool try_rev) {
+ uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+ uint64_t minhashfd = UINT64_MAX;
+ int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
uint64_t mask = (1L<<(2*kmer))-1;
unsigned char *seq = bam_get_seq(b);
int len = b->core.l_qseq;
+ const uint64_t xor = XOR & mask;
// Lookup tables for bam_seqi to 0123 fwd/rev hashes
// =ACM GRSV TWYH KDBN
for (i = 0; i < 16; i++)
R[i] <<= 2*(kmer-1);
- // Punt homopolymers somewhere central in the hash space
-#define XOR (0xdead7878beef7878 & mask)
-
// Initialise hash keys
- for (i = 0; i < kmer-1 && i < len; i++) {
+ for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
int base = bam_seqi(seq, i);
hashf = (hashf<<2) | L[base];
- hashr = (hashr>>2) | R[base];
}
// Loop to find minimum
+ int found_f = 0, found_r = 0;
for (; i < len; i++) {
int base = bam_seqi(seq, i);
+ hashf = ((hashf<<2) | L[base]) & mask;
+ const uint64_t hashfx = hashf^xor;
+
+ // Priority for sorting
+ // 1. Unique key in index
+ // 2. Dup key in index
+ // 3. Everything else
+ int index = 0;
+ if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+ khiter_t k = kh_get(kmer, kmer_h, hashfx);
+ if (k != kh_end(kmer_h))
+ index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+ }
+ found_f |= index;
+ switch (index) {
+ case 2: minhashfi = hashfx, minhashpfi = i; break;
+ case 1: minhashfd = hashfx, minhashpfd = i; break;
+
+ default:
+ if (minhashf > hashfx)
+ minhashf = hashfx, minhashpf = i;
+ }
+ }
+
+ if (minhashfi != UINT64_MAX)
+ minhashf = minhashfi, minhashpf = minhashpfi;
+ else if (minhashfd != UINT64_MAX)
+ minhashf = minhashfd, minhashpf = minhashpfd;
+
+ // Same as above for the reverse strand
+ int dir = 0;
+ if (try_rev) {
+ uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+ uint64_t minhashrd = UINT64_MAX;
+ int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+
+ for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
+ int base = bam_seqi(seq, i);
+ hashr = (hashr>>2) | R[base];
+ }
+ for (; i < len; i++) {
+ int base = bam_seqi(seq, i);
+ hashr = (hashr>>2) | R[base];
+ const uint64_t hashrx = hashr^xor;
+
+ int index = 0;
+ if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+ khiter_t k = kh_get(kmer, kmer_h, hashrx);
+ if (k != kh_end(kmer_h))
+ index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+ }
+ found_r |= index;
+ switch (index) {
+ case 2: minhashri = hashrx, minhashpri = i; break;
+ case 1: minhashrd = hashrx, minhashprd = i; break;
+
+ default:
+ if (minhashr > hashrx)
+ minhashr = hashrx, minhashpr = i;
+ }
+ }
+ if (minhashri != UINT64_MAX)
+ minhashr = minhashri, minhashpr = minhashpri;
+ else if (minhashrd != UINT64_MAX)
+ minhashr = minhashrd, minhashpr = minhashprd;
+
+ // Pick reverse if better mapping
+ if ((minhashf > minhashr) || (!found_f && found_r)) {
+ if (!found_f || found_r) {
+ minhashf = minhashr;
+ minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+ dir = 1;
+ }
+ }
+ }
+
+#ifdef DEBUG_MINHASH
+ ntot++;
+ khiter_t k = kh_get(kmer, kmer_h, minhashf);
+ if (k != kh_end(kmer_h)) {
+ if (!UNIQ_TEST(kh_value(kmer_h, k)))
+ ndup++;
+ minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+ } else {
+ nmis++;
+ }
+#else
+ // For indexed kmers, our hash key is the position the kmer
+ // occurs in the concatenated reference rather than the hash itself.
+ khiter_t k = kh_get(kmer, kmer_h, minhashf);
+ if (k != kh_end(kmer_h))
+ minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+ if (rev) *rev = dir;
+ if (pos) *pos = minhashpf;
+ return minhashf != UINT64_MAX ? minhashf : 0;
+}
+
+// As per minhash_with_idx but with homopolymer squashing enabled.
+// This function is duplicated to remove conditionals and speed up the
+// hashing code.  (Minus the ifdef-ed out code, which is kept above mainly
+// for posterity.)
+//
+// Runs of identical consecutive bases are treated as a single base when
+// building k-mers.  Candidate keys are ranked: unique in kmer_h, then
+// duplicated in kmer_h, then absent from kmer_h.
+//
+//   b        record whose sequence is hashed
+//   kmer     k-mer length; each base takes 2 bits of the hash
+//   pos      if non-NULL, receives the position recorded for the chosen key
+//   rev      if non-NULL, set to 1 if the reverse strand key was chosen,
+//            otherwise 0
+//   try_rev  also evaluate keys from the reverse strand
+//
+// Returns the value stored in kmer_h (masked with UNIQ_MASK) when the
+// chosen key is indexed, otherwise the hash itself, or 0 if no key was
+// found.
+static uint64_t minhash_with_idx_squash(bam1_t *b, int kmer, int *pos,
+                                        int *rev, bool try_rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+    uint64_t minhashfd = UINT64_MAX;
+    int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
+    // Build the mask from a 64-bit unsigned one: "1L" is only 32 bits wide
+    // where long is 32-bit, making the shift undefined for kmer >= 16.
+    uint64_t mask = kmer < 32 ? (1ULL<<(2*kmer))-1 : UINT64_MAX;
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    const uint64_t xor = XOR & mask;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    // Reverse strand bases enter at the top of the k-mer and shift down.
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    // Initialise hash keys
+    int last_base = -1;
+    for (i = j = 0; j < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
+        j++;
+        hashf = (hashf<<2) | L[base];
+    }
+
+    // Loop to find minimum
+    int found_f = 0, found_r = 0;
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
         hashf = ((hashf<<2) | L[base]) & mask;
-        hashr = (hashr>>2) | R[base];
+        const uint64_t hashfx = hashf^xor;
+
+        // Priority for sorting
+        // 1. Unique key in index
+        // 2. Dup key in index
+        // 3. Everything else
+        int index = 0;
+        if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+            khiter_t k = kh_get(kmer, kmer_h, hashfx);
+            if (k != kh_end(kmer_h))
+                index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+        }
+        found_f |= index;
+        // NOTE(review): until a unique key has been seen, case 1 replaces
+        // minhashfd even when hashfx is larger than the current value;
+        // confirm whether the minimum duplicate key was intended.
+        switch (index) {
+        case 2: minhashfi = hashfx, minhashpfi = i; break;
+        case 1: minhashfd = hashfx, minhashpfd = i; break;
-        if (minhashf > (hashf^XOR))
-            minhashf = (hashf^XOR), minhashpf = i;
-        if (minhashr > (hashr^XOR))
-            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+        default:
+            if (minhashf > hashfx)
+                minhashf = hashfx, minhashpf = i;
+        }
+    }
+
+    if (minhashfi != UINT64_MAX)
+        minhashf = minhashfi, minhashpf = minhashpfi;
+    else if (minhashfd != UINT64_MAX)
+        minhashf = minhashfd, minhashpf = minhashpfd;
+
+    // Same as above for the reverse strand
+    int dir = 0;
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+        uint64_t minhashrd = UINT64_MAX;
+        int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+        int last_base = -1;
+
+        for (i = j = 0; j < kmer-1 && i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            j++;
+            hashr = (hashr>>2) | R[base];
+        }
+        for (; i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            hashr = (hashr>>2) | R[base];
+            const uint64_t hashrx = hashr^xor;
+
+            int index = 0;
+            if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+                khiter_t k = kh_get(kmer, kmer_h, hashrx);
+                if (k != kh_end(kmer_h))
+                    index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+            }
+            found_r |= index;
+            switch (index) {
+            case 2: minhashri = hashrx, minhashpri = i; break;
+            case 1: minhashrd = hashrx, minhashprd = i; break;
+            default:
+                if (minhashr > hashrx)
+                    minhashr = hashrx, minhashpr = i;
+            }
+        }
+        if (minhashri != UINT64_MAX)
+            minhashr = minhashri, minhashpr = minhashpri;
+        else if (minhashrd != UINT64_MAX)
+            minhashr = minhashrd, minhashpr = minhashprd;
+
+        // Pick reverse if better mapping
+        if ((minhashf > minhashr) || (!found_f && found_r)) {
+            if (!found_f || found_r) {
+                minhashf = minhashr;
+                minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+                dir = 1;
+            }
+        }
     }
-    if (minhashf <= minhashr) {
-        if (rev) *rev = 0;
-        if (pos) *pos = minhashpf;
-        return minhashf;
+#ifdef DEBUG_MINHASH
+    ntot++;
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h)) {
+        if (!UNIQ_TEST(kh_value(kmer_h, k)))
+            ndup++;
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
     } else {
-        if (rev) *rev = 1;
-        if (pos) *pos = minhashpr;
-        return minhashr;
+        nmis++;
     }
+#else
+    // For indexed kmers, our hash key is the position the kmer
+    // occurs in the concatenated reference rather than the hash itself.
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h))
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+    if (rev) *rev = dir;
+    if (pos) *pos = minhashpf;
+
+    // UINT64_MAX means no key was ever recorded; report that as 0.
+    return minhashf != UINT64_MAX ? minhashf : 0;
 }
//--- Start of candidates to punt to htslib
continue;
int pos = 0, rev = 0;
- uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev);
+ uint64_t mh = kmer_h
+ ? (w->no_squash
+ ? minhash_with_idx(b, w->minimiser_kmer, &pos, &rev,
+ w->try_rev)
+ : minhash_with_idx_squash(b, w->minimiser_kmer, &pos, &rev,
+ w->try_rev)
+ )
+ : minhash(b, w->minimiser_kmer, b->core.l_qseq,
+ &pos, NULL, &rev, 1, w->try_rev, w->no_squash);
if (rev)
reverse_complement(b);
+ if (!kmer_h) {
+ mh += 1LL<<30;
+ pos = 65535-pos >= 0 ? 65535-pos : 0;
+ } else {
+ mh -= pos;
+ pos = 0;
+ }
+
+
// Store 64-bit hash in unmapped pos and mpos fields.
// The position of hash is in isize, which we use for
// resolving ties when sorting by hash key.
// These are unused for completely unmapped data and
// will be reset during final output.
- b->core.pos = mh>>31;
+ b->core.pos = (mh>>31) & 0x7fffffff;
b->core.mpos = mh&0x7fffffff;
- b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+ b->core.isize = pos;
}
}
static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
int n_threads, buf_region *in_mem,
- int large_pos, int minimiser_kmer)
+ int large_pos, int minimiser_kmer, bool try_rev,
+ bool no_squash)
{
int i;
size_t pos, rest;
w[i].h = h;
w[i].large_pos = large_pos;
w[i].minimiser_kmer = minimiser_kmer;
+ w[i].try_rev = try_rev;
+ w[i].no_squash = no_squash;
in_mem[i].from = pos;
in_mem[i].to = pos + w[i].buf_len;
pos += w[i].buf_len; rest -= w[i].buf_len;
@param sam_order the order in which the sort should occur
@param sort_tag the tag to use if sorting by Tag
@param minimiser_kmer the kmer size when sorting by MinHash
+  @param  try_rev     try reverse strand when sorting by MinHash
+  @param  no_squash   do not squash homopolymers when computing the MinHash
@param fn name of the file to be sorted
@param prefix prefix of the temporary files (prefix.NNNN.bam are written)
@param fnout name of the final output file to be written
NOT thread safe.
*/
int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
- const char *fn, const char *prefix,
- const char *fnout, const char *modeout,
- size_t _max_mem, int n_threads,
+ bool try_rev, bool no_squash, const char *fn,
+ const char *prefix, const char *fnout,
+ const char *modeout, size_t _max_mem, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
goto err;
int sort_res = sort_blocks(k, buf, header, n_threads,
- in_mem, large_pos, minimiser_kmer);
+ in_mem, large_pos, minimiser_kmer,
+ try_rev, no_squash);
if (sort_res < 0)
goto err;
&fns[consolidate_from], n_threads,
in_mem, buf, keys,
lib_lookup, &htspool, "sort", NULL, NULL,
- NULL, 1, 0) >= 0) {
+ NULL, 1, 0, 0) >= 0) {
merge_res = 0;
break;
}
// Sort last records
if (k > 0) {
num_in_mem = sort_blocks(k, buf, header, n_threads,
- in_mem, large_pos, minimiser_kmer);
+ in_mem, large_pos, minimiser_kmer, try_rev,
+ no_squash);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
n_files, fns, num_in_mem, in_mem, buf, keys,
lib_lookup, &htspool, "sort", in_fmt, out_fmt,
- arg_list, no_pg, write_index) < 0) {
+ arg_list, no_pg, write_index, 1) < 0) {
// Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
sprintf(fnout, "%s.bam", prefix);
SamOrder sam_order = is_by_qname ? QueryName : Coordinate;
g_sam_order = sam_order;
- ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+ ret = bam_sort_core_ext(sam_order, NULL, 0, false, true, fn, prefix,
+ fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
free(fnout);
return ret;
}
" -u Output uncompressed data (equivalent to -l 0)\n"
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -M Use minimiser for clustering unaligned/unplaced reads\n"
+" -R Do not use reverse strand (only compatible with -M)\n"
" -K INT Kmer size to use for minimiser [20]\n"
+" -I FILE Order minimisers by their position in FILE FASTA\n"
+" -w INT Window size for minimiser indexing via -I ref.fa [100]\n"
+" -H Squash homopolymers when computing minimiser\n"
" -n Sort by read name (not compatible with samtools index command)\n"
" -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
" -o FILE Write final output to FILE rather than standard output\n"
SamOrder sam_order = Coordinate;
bool by_tag = false;
int minimiser_kmer = 20;
+ bool try_rev = true;
char* sort_tag = NULL, *arg_list = NULL;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ int window = 100;
+ char *minimiser_ref = NULL;
+ int no_squash = 1;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
switch (c) {
case 'o': fnout = optarg; o_seen = 1; break;
case 'n': sam_order = QueryName; break;
case 1: no_pg = 1; break;
case 2: sam_order = TemplateCoordinate; break;
case 'M': sam_order = MinHash; break;
+ case 'I':
+ sam_order = MinHash; // implicit option
+ minimiser_ref = optarg;
+ break;
+ case 'H': no_squash = 0; break;
+
+ case 'w': window = atoi(optarg); break;
+
+ case 'R': try_rev = false; break;
case 'K':
minimiser_kmer = atoi(optarg);
if (minimiser_kmer < 1)
}
}
+ if (minimiser_ref) {
+ fprintf(stderr, "Building index ... ");
+ fflush(stderr);
+ if (build_minhash_index(minimiser_ref, minimiser_kmer, window,
+ no_squash)) {
+ ret = EXIT_FAILURE;
+ goto sort_end;
+ }
+ fprintf(stderr, "done\n");
+ }
+
// Change sort order if tag sorting is requested. Must update based on secondary index
if (by_tag) {
sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
}
- ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0,
+ ret = bam_sort_core_ext(sam_order, sort_tag,
+ (sam_order == MinHash) ? minimiser_kmer : 0,
+ try_rev, no_squash,
(nargs > 0) ? argv[optind] : "-",
tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out, arg_list, no_pg, ga.write_index);
ret = EXIT_FAILURE;
}
+#ifdef DEBUG_MINHASH
+ fprintf(stderr, "Missed %.1f%%, dup %.1f%%\n",
+ 100.0*nmis/(ntot+.1),
+ 100.0*ndup/(ntot+.1));
+#endif
+
sort_end:
free(tmpprefix.s);
free(arg_list);
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2022 Genome Research Ltd.
+ Copyright (C) 2008-2023 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "bedidx.h"
#include "bam.h"
+//#define DEBUG_MINHASH
+
#define BAM_BLOCK_SIZE 2*1024*1024
#define MAX_TMP_FILES 64
htsThreadPool *htspool,
const char *cmd, const htsFormat *in_fmt,
const htsFormat *out_fmt, char *arg_list, int no_pg,
- int write_index) {
+ int write_index, int final_out) {
samFile *fpout = NULL, **fp = NULL;
heap1_t *heap = NULL;
uint64_t idx = 0;
ks_heapmake(heap, heap_size, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->entry.bam_record;
- if (g_sam_order == MinHash && b->core.tid == -1) {
+ if (g_sam_order == MinHash && b->core.tid == -1 && final_out) {
// Remove the cached minhash value
b->core.pos = -1;
b->core.mpos = -1;
//
// The 64-bit sort key is split over the bam pos and isize fields.
// This permits it to survive writing to temporary file and coming back.
+
+#ifdef DEBUG_MINHASH
+static int ntot = 0, nmis = 0, ndup = 0;
+#endif
+
static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
{
const bam1_t *A = a.bam_record;
if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b);
- const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
- const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
+ const uint64_t m_a = (((uint64_t)A->core.pos)<<31)|(uint32_t)A->core.mpos;
+ const uint64_t m_b = (((uint64_t)B->core.pos)<<31)|(uint32_t)B->core.mpos;
if (m_a < m_b) // by hash
return -1;
else if (m_a > m_b)
return 1;
- else if (A->core.isize < B->core.isize) // by hash location in seq
+
+    // Bigger pos with same minhash means starts further to left
+ else if (A->core.isize > B->core.isize) // by hash location in seq
return -1;
- else if (A->core.isize > B->core.isize)
+ else if (A->core.isize < B->core.isize)
return 1;
else
return bam1_cmp_core(a,b);
int error;
int large_pos;
int minimiser_kmer;
+ bool try_rev;
+ bool no_squash;
} worker_t;
// Returns 0 for success
for (i = 0; i < l; ++i) {
bam1_t *b = buf[i].bam_record;
if (clear_minhash && b->core.tid == -1) {
+ // To see the position for debugging
+ // b->core.pos = ((((uint64_t)b->core.pos)<<31)|(uint32_t)b->core.mpos) + b->core.isize;
// Remove the cached minhash value
b->core.pos = -1;
b->core.mpos = -1;
return ret;
}
+// Instantiate a khash map type named "kmer": 64-bit integer keys mapped
+// to int64_t values.
+KHASH_MAP_INIT_INT64(kmer, int64_t)
+// Minimiser index; NULL until build_minhash_index() has created it.
+static khash_t(kmer) *kmer_h = NULL;
+
+// Punt homopolymers somewhere central in the hash space
+// (users AND this with their k-mer mask before applying it).
+#define XOR 0xdead7878beef7878
+
/*
- * Computes the minhash of a sequence using both forward and reverse strands.
+ * Computes the minhash of a sequence using forward strand and if requested
+ * reverse strand.
*
* This is used as a sort key for unmapped data, to collate like sequences
* together and to improve compression ratio.
* The minhash is returned and *pos filled out with location of this hash
* key in the sequence if pos != NULL.
*/
-static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+// Parameters:
+//   b          record whose sequence is hashed
+//   kmer       k-mer length; each base takes 2 bits of the hash
+//   window     number of bases scanned, starting at *curr_pos
+//   curr_pos   in: first base to scan; out: recorded position of the
+//              chosen hash key minus (kmer-1)
+//   end        if non-NULL, set to non-zero when the scanned window
+//              reached the end of the sequence
+//   is_rev     if non-NULL, set to 1 when the reverse strand hash was
+//              chosen, otherwise 0
+//   try_fwd    evaluate forward strand hashes
+//   try_rev    evaluate reverse strand hashes
+//   no_squash  if zero, runs of identical bases count as a single base
+// Returns UINT64_MAX when no hash key could be formed.
+static uint64_t minhash(bam1_t *b, int kmer, int window, int *curr_pos,
+                        int *end, int *is_rev, int try_fwd, int try_rev,
+                        int no_squash) {
     uint64_t hashf = 0, minhashf = UINT64_MAX;
-    uint64_t hashr = 0, minhashr = UINT64_MAX;
-    int minhashpf = 0, minhashpr = 0, i;
+    int minhashpf = *curr_pos, i, j;
+    // Build the mask from a 64-bit unsigned one: "1L" is only 32 bits wide
+    // where long is 32-bit, making the shift undefined for kmer >= 16.
+    uint64_t mask = kmer < 32 ? (1ULL<<(2*kmer))-1 : UINT64_MAX;
+    uint8_t *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    uint64_t xor = XOR & mask;
+
+    if (is_rev) *is_rev = 0;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    static unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    // Reverse strand bases enter at the top of the k-mer and shift down.
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    int i_start = *curr_pos;
+    int i_end = MIN(i_start + window, len);
+    int last_base = -1;
+
+    if (try_fwd) {
+        // Initialise hash keys
+        for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) {
+            int base = bam_seqi(seq, i);
+            // collapse homopolymers
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashf = (hashf<<2) | L[base];
+                j++;
+            }
+        }
+
+        // Loop to find minimum
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashf = (hashf<<2) | L[base];
+                uint64_t hashfx = (hashf ^ XOR) & mask;
+                if (minhashf > hashfx)
+                    minhashf = hashfx, minhashpf = i;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashf = (hashf<<2) | L[base];
+                    uint64_t hashfx = (hashf ^ XOR) & mask;
+                    if (minhashf > hashfx)
+                        minhashf = hashfx, minhashpf = i;
+                }
+            }
+        }
+    }
+
+    // Same as above for the reverse strand.
+    // Not used for now, but we may wish to consider indexing in both
+    // strands, recording the strand in value (pos), and comparing in one
+    // strand only.  Right now we compare on both against a single-stranded
+    // index.
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX;
+        int minhashpr = *curr_pos;
+        int last_base = -1;
+
+        // Prime the key within the window only, as the forward strand
+        // does.  Anything primed beyond i_end was never used because the
+        // loops below stop at i_end.
+        for (i = i_start, j = 0; j < kmer-1 && i < i_end; i++) {
+            int base = bam_seqi(seq, i);
+            if (no_squash || last_base != base) {
+                last_base = base;
+                hashr = (hashr>>2) | R[base];
+                j++;
+            }
+        }
+
+        if (no_squash) {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                hashr = (hashr>>2) | R[base];
+                if (minhashr > (hashr^xor))
+                    minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+            }
+        } else {
+            for (; i < i_end; i++) {
+                int base = bam_seqi(seq, i);
+                if (last_base != base) {
+                    last_base = base;
+                    hashr = (hashr>>2) | R[base];
+                    if (minhashr > (hashr^xor))
+                        minhashr = (hashr^xor), minhashpr = len-i+kmer-2;
+                }
+            }
+        }
+
+        // Reverse wins only when strictly smaller than the forward hash.
+        if (minhashr < minhashf) {
+            minhashf = minhashr;
+            minhashpf = minhashpr;
+            if (is_rev) *is_rev = 1;
+        }
+    }
+
+    // "*curr_pos = minhashpf" is faster here, but is sometimes
+    // poorer in compression.  Eg 10 million novaseq records with
+    // 75.1MB vs 76.9MB cram BA field.
+    //*curr_pos = minhashpf;
+    *curr_pos = minhashpf - (kmer-1);
+    if (end) *end = (i_end == len);
+    return minhashf;
+}
+
+// Values stored in kmer_h hold a position in the low UNIQ_BIT bits.
+// Bit UNIQ_BIT is set when the key was already present at insertion time,
+// so UNIQ_TEST() is true only for keys seen exactly once.
+#define UNIQ_BIT 60
+#define UNIQ_TEST(x) (((x) & (1ULL<<UNIQ_BIT))==0)
+#define UNIQ_MASK ((1ULL<<UNIQ_BIT)-1)
+
+/*
+ * Builds the global minimiser index kmer_h from the sequences in fn.
+ *
+ * Each record of at least "window" bases is scanned window by window on
+ * the forward strand; the minhash of every window is stored together with
+ * its position in the concatenation of the indexed sequences.
+ *
+ * Returns 0 on success, 1 on failure.  On failure after the table was
+ * allocated, kmer_h is freed and reset to NULL.
+ */
+static int build_minhash_index(char *fn, int kmer, int window, int no_squash) {
+    int ret = 1;
+    samFile *in;
+    sam_hdr_t *h = NULL;
+    bam1_t *b = NULL;
+
+    in = sam_open(fn, "r");
+    if (!in) {
+        perror(fn);
+        return 1;
+    }
+
+    kmer_h = kh_init(kmer);
+    if (!kmer_h)
+        goto err;
+
+    if (!(h = sam_hdr_read(in)))
+        goto err;
+
+    if (!(b = bam_init1()))
+        goto err;
+
+    int r;
+    uint64_t tpos = 0;  // offset of this record in the concatenated input
+    while ((r = sam_read1(in, h, b)) >= 0) {
+        //fprintf(samtools_stderr, "LEN\t%d\t%s\n", b->core.l_qseq, bam_get_qname(b));
+        uint64_t hashf;
+        int pos = 0, end = 0;
+        khiter_t k;
+        int put_ret;    // kh_put() status; kept distinct from outer "ret"
+
+        if (b->core.l_qseq < window)
+            continue;
+
+        // fwd
+        while (!end) {
+            int last_pos = pos;
+            hashf = minhash(b, kmer, window, &pos, &end, NULL, 1, 0,
+                            no_squash);
+            k = kh_put(kmer, kmer_h, hashf, &put_ret);
+            // A negative status means the table could not grow; k is not
+            // a valid slot then, so it must not be written through.
+            if (put_ret < 0)
+                goto err;
+            // put_ret == 0: key already present, so flag it as duplicated.
+            kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!put_ret)<<UNIQ_BIT);
+            pos = MAX(last_pos+kmer, pos+1);
+            //pos++; Slower, but indexes a bit better?
+        }
+        tpos += b->core.l_qseq;
+
+// We could also add reverse keys to the index here.
+// This would avoid reverse complementing during the matching stage.
+// We'd need to add a flag (another high bit of kh_value) to indicate
+// strand.
+// I'm unsure if this is a good trade-off or not.
+
+//      // rev
+//      pos = 0; end = 0;
+//      while (!end) {
+//          hashf = minhash(b, kmer, window, &pos, &end, NULL, 0, 1,
+//                          no_squash);
+//          k = kh_put(kmer, kmer_h, hashf, &ret);
+//          kh_value(kmer_h, k) = tpos+pos + (((uint64_t)!ret)<<UNIQ_BIT);
+//          pos++;
+//      }
+//
+//      tpos += b->core.l_qseq;
+    }
+    // sam_read1() returns -1 at end of file; anything lower is an error.
+    if (r < -1)
+        goto err;
+
+    ret = 0;
+ err:
+    // Never leave a partially built index behind for later lookups.
+    if (ret != 0 && kmer_h) {
+        kh_destroy(kmer, kmer_h);
+        kmer_h = NULL;
+    }
+    if (b) bam_destroy1(b);
+    if (h) sam_hdr_destroy(h);
+    sam_close(in);
+
+    return ret;
+}
+
+/*
+ * A variant of minhash that compares against a previously built index.
+ *
+ * We follow the same steps of scanning through this sequence to find the
+ * minimum hash, but we prefer hash keys that have unique placement in the
+ * index, or if not unique, then non-uniquely placed, over ones that
+ * are absent from the index.
+ */
+static uint64_t minhash_with_idx(bam1_t *b, int kmer, int *pos, int *rev,
+ bool try_rev) {
+ uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+ uint64_t minhashfd = UINT64_MAX;
+ int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
uint64_t mask = (1L<<(2*kmer))-1;
unsigned char *seq = bam_get_seq(b);
int len = b->core.l_qseq;
+ const uint64_t xor = XOR & mask;
// Lookup tables for bam_seqi to 0123 fwd/rev hashes
// =ACM GRSV TWYH KDBN
for (i = 0; i < 16; i++)
R[i] <<= 2*(kmer-1);
- // Punt homopolymers somewhere central in the hash space
-#define XOR (0xdead7878beef7878 & mask)
-
// Initialise hash keys
- for (i = 0; i < kmer-1 && i < len; i++) {
+ for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
int base = bam_seqi(seq, i);
hashf = (hashf<<2) | L[base];
- hashr = (hashr>>2) | R[base];
}
// Loop to find minimum
+ int found_f = 0, found_r = 0;
for (; i < len; i++) {
int base = bam_seqi(seq, i);
+ hashf = ((hashf<<2) | L[base]) & mask;
+ const uint64_t hashfx = hashf^xor;
+
+ // Priority for sorting
+ // 1. Unique key in index
+ // 2. Dup key in index
+ // 3. Everything else
+ int index = 0;
+ if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+ khiter_t k = kh_get(kmer, kmer_h, hashfx);
+ if (k != kh_end(kmer_h))
+ index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+ }
+ found_f |= index;
+ switch (index) {
+ case 2: minhashfi = hashfx, minhashpfi = i; break;
+ case 1: minhashfd = hashfx, minhashpfd = i; break;
+
+ default:
+ if (minhashf > hashfx)
+ minhashf = hashfx, minhashpf = i;
+ }
+ }
+
+ if (minhashfi != UINT64_MAX)
+ minhashf = minhashfi, minhashpf = minhashpfi;
+ else if (minhashfd != UINT64_MAX)
+ minhashf = minhashfd, minhashpf = minhashpfd;
+
+ // Same as above for the reverse strand
+ int dir = 0;
+ if (try_rev) {
+ uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+ uint64_t minhashrd = UINT64_MAX;
+ int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+
+ for (i = j = 0; j < kmer-1 && i < len; i++, j++) {
+ int base = bam_seqi(seq, i);
+ hashr = (hashr>>2) | R[base];
+ }
+ for (; i < len; i++) {
+ int base = bam_seqi(seq, i);
+ hashr = (hashr>>2) | R[base];
+ const uint64_t hashrx = hashr^xor;
+
+ int index = 0;
+ if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+ khiter_t k = kh_get(kmer, kmer_h, hashrx);
+ if (k != kh_end(kmer_h))
+ index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+ }
+ found_r |= index;
+ switch (index) {
+ case 2: minhashri = hashrx, minhashpri = i; break;
+ case 1: minhashrd = hashrx, minhashprd = i; break;
+
+ default:
+ if (minhashr > hashrx)
+ minhashr = hashrx, minhashpr = i;
+ }
+ }
+ if (minhashri != UINT64_MAX)
+ minhashr = minhashri, minhashpr = minhashpri;
+ else if (minhashrd != UINT64_MAX)
+ minhashr = minhashrd, minhashpr = minhashprd;
+
+ // Pick reverse if better mapping
+ if ((minhashf > minhashr) || (!found_f && found_r)) {
+ if (!found_f || found_r) {
+ minhashf = minhashr;
+ minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+ dir = 1;
+ }
+ }
+ }
+
+#ifdef DEBUG_MINHASH
+ ntot++;
+ khiter_t k = kh_get(kmer, kmer_h, minhashf);
+ if (k != kh_end(kmer_h)) {
+ if (!UNIQ_TEST(kh_value(kmer_h, k)))
+ ndup++;
+ minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+ } else {
+ nmis++;
+ }
+#else
+ // For indexed kmers, our hash key is the position the kmer
+ // occurs in the concatenated reference rather than the hash itself.
+ khiter_t k = kh_get(kmer, kmer_h, minhashf);
+ if (k != kh_end(kmer_h))
+ minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+ if (rev) *rev = dir;
+ if (pos) *pos = minhashpf;
+ return minhashf != UINT64_MAX ? minhashf : 0;
+}
+
+// As per minhash_with_idx but with homopolymer squashing enabled.
+// This function is duplicated to remove conditionals and speed up the
+// hashing code.  (Minus the ifdef-ed out code, which is kept above mainly
+// for posterity.)
+//
+// Runs of identical consecutive bases are treated as a single base when
+// building k-mers.  Candidate keys are ranked: unique in kmer_h, then
+// duplicated in kmer_h, then absent from kmer_h.
+//
+//   b        record whose sequence is hashed
+//   kmer     k-mer length; each base takes 2 bits of the hash
+//   pos      if non-NULL, receives the position recorded for the chosen key
+//   rev      if non-NULL, set to 1 if the reverse strand key was chosen,
+//            otherwise 0
+//   try_rev  also evaluate keys from the reverse strand
+//
+// Returns the value stored in kmer_h (masked with UNIQ_MASK) when the
+// chosen key is indexed, otherwise the hash itself, or 0 if no key was
+// found.
+static uint64_t minhash_with_idx_squash(bam1_t *b, int kmer, int *pos,
+                                        int *rev, bool try_rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX, minhashfi = UINT64_MAX;
+    uint64_t minhashfd = UINT64_MAX;
+    int minhashpf = 0, minhashpfi = 0, minhashpfd = 0, i, j;
+    // Build the mask from a 64-bit unsigned one: "1L" is only 32 bits wide
+    // where long is 32-bit, making the shift undefined for kmer >= 16.
+    uint64_t mask = kmer < 32 ? (1ULL<<(2*kmer))-1 : UINT64_MAX;
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+    const uint64_t xor = XOR & mask;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    // Reverse strand bases enter at the top of the k-mer and shift down.
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    // Initialise hash keys
+    int last_base = -1;
+    for (i = j = 0; j < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
+        j++;
+        hashf = (hashf<<2) | L[base];
+    }
+
+    // Loop to find minimum
+    int found_f = 0, found_r = 0;
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+        if (base == last_base)
+            continue;
+        last_base = base;
         hashf = ((hashf<<2) | L[base]) & mask;
-        hashr = (hashr>>2) | R[base];
+        const uint64_t hashfx = hashf^xor;
+
+        // Priority for sorting
+        // 1. Unique key in index
+        // 2. Dup key in index
+        // 3. Everything else
+        int index = 0;
+        if (minhashfi > hashfx || (found_f < 2 && minhashfd > hashfx)) {
+            khiter_t k = kh_get(kmer, kmer_h, hashfx);
+            if (k != kh_end(kmer_h))
+                index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+        }
+        found_f |= index;
+        // NOTE(review): until a unique key has been seen, case 1 replaces
+        // minhashfd even when hashfx is larger than the current value;
+        // confirm whether the minimum duplicate key was intended.
+        switch (index) {
+        case 2: minhashfi = hashfx, minhashpfi = i; break;
+        case 1: minhashfd = hashfx, minhashpfd = i; break;
-        if (minhashf > (hashf^XOR))
-            minhashf = (hashf^XOR), minhashpf = i;
-        if (minhashr > (hashr^XOR))
-            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+        default:
+            if (minhashf > hashfx)
+                minhashf = hashfx, minhashpf = i;
+        }
+    }
+
+    if (minhashfi != UINT64_MAX)
+        minhashf = minhashfi, minhashpf = minhashpfi;
+    else if (minhashfd != UINT64_MAX)
+        minhashf = minhashfd, minhashpf = minhashpfd;
+
+    // Same as above for the reverse strand
+    int dir = 0;
+    if (try_rev) {
+        uint64_t hashr = 0, minhashr = UINT64_MAX, minhashri = UINT64_MAX;
+        uint64_t minhashrd = UINT64_MAX;
+        int minhashpr = 0, minhashpri = 0, minhashprd = 0;
+        int last_base = -1;
+
+        for (i = j = 0; j < kmer-1 && i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            j++;
+            hashr = (hashr>>2) | R[base];
+        }
+        for (; i < len; i++) {
+            int base = bam_seqi(seq, i);
+            if (base == last_base)
+                continue;
+            last_base = base;
+            hashr = (hashr>>2) | R[base];
+            const uint64_t hashrx = hashr^xor;
+
+            int index = 0;
+            if (minhashri > hashrx || (found_r < 2 && minhashrd > hashrx)) {
+                khiter_t k = kh_get(kmer, kmer_h, hashrx);
+                if (k != kh_end(kmer_h))
+                    index = UNIQ_TEST(kh_value(kmer_h, k)) ? 2 : 1;
+            }
+            found_r |= index;
+            switch (index) {
+            case 2: minhashri = hashrx, minhashpri = i; break;
+            case 1: minhashrd = hashrx, minhashprd = i; break;
+            default:
+                if (minhashr > hashrx)
+                    minhashr = hashrx, minhashpr = i;
+            }
+        }
+        if (minhashri != UINT64_MAX)
+            minhashr = minhashri, minhashpr = minhashpri;
+        else if (minhashrd != UINT64_MAX)
+            minhashr = minhashrd, minhashpr = minhashprd;
+
+        // Pick reverse if better mapping
+        if ((minhashf > minhashr) || (!found_f && found_r)) {
+            if (!found_f || found_r) {
+                minhashf = minhashr;
+                minhashpf = b->core.l_qseq - minhashpr + kmer - 2;
+                dir = 1;
+            }
+        }
     }
-    if (minhashf <= minhashr) {
-        if (rev) *rev = 0;
-        if (pos) *pos = minhashpf;
-        return minhashf;
+#ifdef DEBUG_MINHASH
+    ntot++;
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h)) {
+        if (!UNIQ_TEST(kh_value(kmer_h, k)))
+            ndup++;
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
     } else {
-        if (rev) *rev = 1;
-        if (pos) *pos = minhashpr;
-        return minhashr;
+        nmis++;
     }
+#else
+    // For indexed kmers, our hash key is the position the kmer
+    // occurs in the concatenated reference rather than the hash itself.
+    khiter_t k = kh_get(kmer, kmer_h, minhashf);
+    if (k != kh_end(kmer_h))
+        minhashf = kh_value(kmer_h, k) & UNIQ_MASK;
+#endif
+
+    if (rev) *rev = dir;
+    if (pos) *pos = minhashpf;
+
+    // UINT64_MAX means no key was ever recorded; report that as 0.
+    return minhashf != UINT64_MAX ? minhashf : 0;
 }
//--- Start of candidates to punt to htslib
continue;
int pos = 0, rev = 0;
- uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev);
+ uint64_t mh = kmer_h
+ ? (w->no_squash
+ ? minhash_with_idx(b, w->minimiser_kmer, &pos, &rev,
+ w->try_rev)
+ : minhash_with_idx_squash(b, w->minimiser_kmer, &pos, &rev,
+ w->try_rev)
+ )
+ : minhash(b, w->minimiser_kmer, b->core.l_qseq,
+ &pos, NULL, &rev, 1, w->try_rev, w->no_squash);
if (rev)
reverse_complement(b);
+ if (!kmer_h) {
+ mh += 1LL<<30;
+ pos = 65535-pos >= 0 ? 65535-pos : 0;
+ } else {
+ mh -= pos;
+ pos = 0;
+ }
+
+
// Store 64-bit hash in unmapped pos and mpos fields.
// The position of hash is in isize, which we use for
// resolving ties when sorting by hash key.
// These are unused for completely unmapped data and
// will be reset during final output.
- b->core.pos = mh>>31;
+ b->core.pos = (mh>>31) & 0x7fffffff;
b->core.mpos = mh&0x7fffffff;
- b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+ b->core.isize = pos;
}
}
static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
int n_threads, buf_region *in_mem,
- int large_pos, int minimiser_kmer)
+ int large_pos, int minimiser_kmer, bool try_rev,
+ bool no_squash)
{
int i;
size_t pos, rest;
w[i].h = h;
w[i].large_pos = large_pos;
w[i].minimiser_kmer = minimiser_kmer;
+ w[i].try_rev = try_rev;
+ w[i].no_squash = no_squash;
in_mem[i].from = pos;
in_mem[i].to = pos + w[i].buf_len;
pos += w[i].buf_len; rest -= w[i].buf_len;
@param sam_order the order in which the sort should occur
@param sort_tag the tag to use if sorting by Tag
@param minimiser_kmer the kmer size when sorting by MinHash
+  @param  try_rev     try reverse strand when sorting by MinHash
+  @param  no_squash   do not squash homopolymers when computing the MinHash
@param fn name of the file to be sorted
@param prefix prefix of the temporary files (prefix.NNNN.bam are written)
@param fnout name of the final output file to be written
NOT thread safe.
*/
int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
- const char *fn, const char *prefix,
- const char *fnout, const char *modeout,
- size_t _max_mem, int n_threads,
+ bool try_rev, bool no_squash, const char *fn,
+ const char *prefix, const char *fnout,
+ const char *modeout, size_t _max_mem, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
goto err;
int sort_res = sort_blocks(k, buf, header, n_threads,
- in_mem, large_pos, minimiser_kmer);
+ in_mem, large_pos, minimiser_kmer,
+ try_rev, no_squash);
if (sort_res < 0)
goto err;
&fns[consolidate_from], n_threads,
in_mem, buf, keys,
lib_lookup, &htspool, "sort", NULL, NULL,
- NULL, 1, 0) >= 0) {
+ NULL, 1, 0, 0) >= 0) {
merge_res = 0;
break;
}
// Sort last records
if (k > 0) {
num_in_mem = sort_blocks(k, buf, header, n_threads,
- in_mem, large_pos, minimiser_kmer);
+ in_mem, large_pos, minimiser_kmer, try_rev,
+ no_squash);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
n_files, fns, num_in_mem, in_mem, buf, keys,
lib_lookup, &htspool, "sort", in_fmt, out_fmt,
- arg_list, no_pg, write_index) < 0) {
+ arg_list, no_pg, write_index, 1) < 0) {
// Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
sprintf(fnout, "%s.bam", prefix);
SamOrder sam_order = is_by_qname ? QueryName : Coordinate;
g_sam_order = sam_order;
- ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+ ret = bam_sort_core_ext(sam_order, NULL, 0, false, true, fn, prefix,
+ fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
free(fnout);
return ret;
}
" -u Output uncompressed data (equivalent to -l 0)\n"
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -M Use minimiser for clustering unaligned/unplaced reads\n"
+" -R Do not use reverse strand (only compatible with -M)\n"
" -K INT Kmer size to use for minimiser [20]\n"
+" -I FILE Order minimisers by their position in FILE FASTA\n"
+" -w INT Window size for minimiser indexing via -I ref.fa [100]\n"
+" -H Squash homopolymers when computing minimiser\n"
" -n Sort by read name (not compatible with samtools index command)\n"
" -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
" -o FILE Write final output to FILE rather than standard output\n"
SamOrder sam_order = Coordinate;
bool by_tag = false;
int minimiser_kmer = 20;
+ bool try_rev = true;
char* sort_tag = NULL, *arg_list = NULL;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ int window = 100;
+ char *minimiser_ref = NULL;
+ int no_squash = 1;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MI:K:uRw:H", lopts, NULL)) >= 0) {
switch (c) {
case 'o': fnout = optarg; o_seen = 1; break;
case 'n': sam_order = QueryName; break;
case 1: no_pg = 1; break;
case 2: sam_order = TemplateCoordinate; break;
case 'M': sam_order = MinHash; break;
+ case 'I':
+ sam_order = MinHash; // implicit option
+ minimiser_ref = optarg;
+ break;
+ case 'H': no_squash = 0; break;
+
+ case 'w': window = atoi(optarg); break;
+
+ case 'R': try_rev = false; break;
case 'K':
minimiser_kmer = atoi(optarg);
if (minimiser_kmer < 1)
}
}
+ if (minimiser_ref) {
+ fprintf(samtools_stderr, "Building index ... ");
+ fflush(samtools_stderr);
+ if (build_minhash_index(minimiser_ref, minimiser_kmer, window,
+ no_squash)) {
+ ret = EXIT_FAILURE;
+ goto sort_end;
+ }
+ fprintf(samtools_stderr, "done\n");
+ }
+
// Change sort order if tag sorting is requested. Must update based on secondary index
if (by_tag) {
sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
}
- ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0,
+ ret = bam_sort_core_ext(sam_order, sort_tag,
+ (sam_order == MinHash) ? minimiser_kmer : 0,
+ try_rev, no_squash,
(nargs > 0) ? argv[optind] : "-",
tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out, arg_list, no_pg, ga.write_index);
ret = EXIT_FAILURE;
}
+#ifdef DEBUG_MINHASH
+ fprintf(samtools_stderr, "Missed %.1f%%, dup %.1f%%\n",
+ 100.0*nmis/(ntot+.1),
+ 100.0*ndup/(ntot+.1));
+#endif
+
sort_end:
free(tmpprefix.s);
free(arg_list);
/* bam_split.c -- split subcommand.
- Copyright (C) 2013-2016,2018-2019 Genome Research Ltd.
+ Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd.
Author: Martin Pollard <mp15@sanger.ac.uk>
}
}
- retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
+ retval->merged_input_file = sam_open_format(opts->merged_input_name, "r", &opts->ga.in);
if (!retval->merged_input_file) {
print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
cleanup_state(retval, false);
}
}
- retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
+ char outmode[4] = "w";
+ sam_open_mode(outmode + 1, opts->unaccounted_name, NULL);
+ retval->unaccounted_file = sam_open_format(opts->unaccounted_name, outmode, &opts->ga.out);
+
if (retval->unaccounted_file == NULL) {
print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
cleanup_state(retval, false);
size_t i;
for (i = 0; i < retval->output_count; i++) {
char* output_filename = NULL;
+ char outmode[4] = "w";
output_filename = expand_format_string(opts->output_format_string,
input_base_name,
}
retval->rg_output_file_name[i] = output_filename;
- retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
+
+ sam_open_mode(outmode + 1, output_filename, NULL);
+ retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+
if (retval->rg_output_file[i] == NULL) {
print_error_errno("split", "Could not open \"%s\"", output_filename);
cleanup_state(retval, false);
/* bam_split.c -- split subcommand.
- Copyright (C) 2013-2016,2018-2019 Genome Research Ltd.
+ Copyright (C) 2013-2016,2018-2019,2023 Genome Research Ltd.
Author: Martin Pollard <mp15@sanger.ac.uk>
}
}
- retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
+ retval->merged_input_file = sam_open_format(opts->merged_input_name, "r", &opts->ga.in);
if (!retval->merged_input_file) {
print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
cleanup_state(retval, false);
}
}
- retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
+ char outmode[4] = "w";
+ sam_open_mode(outmode + 1, opts->unaccounted_name, NULL);
+ retval->unaccounted_file = sam_open_format(opts->unaccounted_name, outmode, &opts->ga.out);
+
if (retval->unaccounted_file == NULL) {
print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
cleanup_state(retval, false);
size_t i;
for (i = 0; i < retval->output_count; i++) {
char* output_filename = NULL;
+ char outmode[4] = "w";
output_filename = expand_format_string(opts->output_format_string,
input_base_name,
}
retval->rg_output_file_name[i] = output_filename;
- retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
+
+ sam_open_mode(outmode + 1, output_filename, NULL);
+ retval->rg_output_file[i] = sam_open_format(output_filename, outmode, &opts->ga.out);
+
if (retval->rg_output_file[i] == NULL) {
print_error_errno("split", "Could not open \"%s\"", output_filename);
cleanup_state(retval, false);
" -l INT Compression level [%d]\n" // DEF_CLEVEL
" -n INT Number of temporary files [%d]\n" // n_files
" -T PREFIX\n"
- " Write tempory files to PREFIX.nnnn.bam\n"
+ " Write temporary files to PREFIX.nnnn.bam\n"
" --no-PG do not add a PG line\n",
reads_store, DEF_CLEVEL, n_files);
" -l INT Compression level [%d]\n" // DEF_CLEVEL
" -n INT Number of temporary files [%d]\n" // n_files
" -T PREFIX\n"
- " Write tempory files to PREFIX.nnnn.bam\n"
+ " Write temporary files to PREFIX.nnnn.bam\n"
" --no-PG do not add a PG line\n",
reads_store, DEF_CLEVEL, n_files);
/* consensus__pileup.h -- Pileup orientated data per consensus column
- Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd.
+ Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd.
Author: James Bonfied <jkb@sanger.ac.uk>
/* consensus__pileup.h -- Pileup orientated data per consensus column
- Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd.
+ Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd.
Author: James Bonfied <jkb@sanger.ac.uk>
/* consensus_pileup.h -- Pileup orientated data per consensus column
- Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd.
+ Copyright (C) 2013-2016, 2020-2022 Genome Research Ltd.
Author: James Bonfied <jkb@sanger.ac.uk>
fprintf(outfp, "Number of slices %18"PRId64"\n", nslice);
fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs);
fprintf(outfp, "Number of bases %18"PRId64"\n", nbases);
- fprintf(outfp, "Total file size %18"PRId64"\n", end);
- fprintf(outfp, "Format overhead size %18"PRId64"\n", end - tot_size);
+ fprintf(outfp, "Total file size %18"PRId64"\n", (int64_t) end);
+ fprintf(outfp, "Format overhead size %18"PRId64"\n", (int64_t) (end - tot_size));
return 0;
fprintf(outfp, "Number of slices %18"PRId64"\n", nslice);
fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs);
fprintf(outfp, "Number of bases %18"PRId64"\n", nbases);
- fprintf(outfp, "Total file size %18"PRId64"\n", end);
- fprintf(outfp, "Format overhead size %18"PRId64"\n", end - tot_size);
+ fprintf(outfp, "Total file size %18"PRId64"\n", (int64_t) end);
+ fprintf(outfp, "Format overhead size %18"PRId64"\n", (int64_t) (end - tot_size));
return 0;
*/
+#include <config.h>
+
#include "samtools.h"
#include "htslib/sam.h"
#include "sam_opts.h"
*/
+#include <config.h>
+
#include "samtools.h"
#include "htslib/sam.h"
#include "sam_opts.h"
}
}
-// Returns 0 to indicate read should be output 1 otherwise
+// Returns 0 if the read should be output, 1 if it should be skipped,
+// and -1 on error.
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
{
- if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
- return 1;
+ if (settings->filter) {
+ int r = sam_passes_filter(h, b, settings->filter);
+ if (r < 0) // err
+ return -1;
+ if (r == 0) // filter-out
+ return 1;
+ }
if (settings->remove_B) bam_remove_B(b);
if (settings->min_qlen > 0) {
while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) {
if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue;
if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue;
- if ( process_aln(conf->header, rec, conf) ) continue;
+ int p = process_aln(conf->header, rec, conf);
+ if (p < 0) goto out;
+ if (p == 1) continue;
nmates++;
k = kh_get(names,mate_names,bam_get_qname(rec));
if ( k != kh_end(mate_names) ) drop = 0;
}
- if (!drop && process_aln(conf->header, rec, conf) == 0) {
+ int p = 0;
+ if (!drop && (p=process_aln(conf->header, rec, conf))== 0) {
if (adjust_tags(conf->header, rec, conf) != 0)
goto out;
if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out,
&write_error) < 0)
goto out;
}
+ if (p < 0)
+ goto out;
}
if (r < -1) {
if (bam_sanitize(conf->header, b, conf->sanitize) < 0)
return -1;
- if (!process_aln(conf->header, b, conf)) {
+ int p;
+ if ((p = process_aln(conf->header, b, conf)) < 0) {
+ // error
+ return -1;
+ } else if (p == 0) {
+ // emit read
if (!conf->is_count) {
change_flag(b, conf);
if (adjust_tags(conf->header, b, conf) != 0)
static int stream_view(samview_settings_t *conf) {
bam1_t *b = bam_init1();
- int write_error = 0, r;
+ int write_error = 0, r, p = 0;
if (!b) {
print_error_errno("view", "could not allocate bam record");
return 1;
}
errno = 0; // prevent false error messages.
while ((r = sam_read1(conf->in, conf->header, b)) >= 0) {
- if (process_one_record(conf, b, &write_error) < 0) break;
+ if ((p = process_one_record(conf, b, &write_error)) < 0) break;
}
bam_destroy1(b);
- if (r < -1) {
+ if (r < -1 || p < 0) {
print_error_errno("view", "error reading file \"%s\"", conf->fn_in);
return 1;
}
}
}
-// Returns 0 to indicate read should be output 1 otherwise
+// Returns 0 if the read should be output, 1 if it should be skipped,
+// and -1 on error.
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
{
- if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
- return 1;
+ if (settings->filter) {
+ int r = sam_passes_filter(h, b, settings->filter);
+ if (r < 0) // err
+ return -1;
+ if (r == 0) // filter-out
+ return 1;
+ }
if (settings->remove_B) bam_remove_B(b);
if (settings->min_qlen > 0) {
while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) {
if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue;
if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue;
- if ( process_aln(conf->header, rec, conf) ) continue;
+ int p = process_aln(conf->header, rec, conf);
+ if (p < 0) goto out;
+ if (p == 1) continue;
nmates++;
k = kh_get(names,mate_names,bam_get_qname(rec));
if ( k != kh_end(mate_names) ) drop = 0;
}
- if (!drop && process_aln(conf->header, rec, conf) == 0) {
+ int p = 0;
+ if (!drop && (p=process_aln(conf->header, rec, conf))== 0) {
if (adjust_tags(conf->header, rec, conf) != 0)
goto out;
if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out,
&write_error) < 0)
goto out;
}
+ if (p < 0)
+ goto out;
}
if (r < -1) {
if (bam_sanitize(conf->header, b, conf->sanitize) < 0)
return -1;
- if (!process_aln(conf->header, b, conf)) {
+ int p;
+ if ((p = process_aln(conf->header, b, conf)) < 0) {
+ // error
+ return -1;
+ } else if (p == 0) {
+ // emit read
if (!conf->is_count) {
change_flag(b, conf);
if (adjust_tags(conf->header, b, conf) != 0)
static int stream_view(samview_settings_t *conf) {
bam1_t *b = bam_init1();
- int write_error = 0, r;
+ int write_error = 0, r, p = 0;
if (!b) {
print_error_errno("view", "could not allocate bam record");
return 1;
}
errno = 0; // prevent false error messages.
while ((r = sam_read1(conf->in, conf->header, b)) >= 0) {
- if (process_one_record(conf, b, &write_error) < 0) break;
+ if ((p = process_one_record(conf, b, &write_error)) < 0) break;
}
bam_destroy1(b);
- if (r < -1) {
+ if (r < -1 || p < 0) {
print_error_errno("view", "error reading file \"%s\"", conf->fn_in);
return 1;
}
fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0;
+ float avg_read_length = (stats->nreads_1st +
+ stats->nreads_2nd +
+ stats->nreads_other)
+ ? (float)stats->total_len / (stats->nreads_1st +
+ stats->nreads_2nd +
+ stats->nreads_other)
+ : 0;
fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length);
fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0);
fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? (float)stats->total_len_2nd/stats->nreads_2nd:0);
fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches);
fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0);
- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0;
+ float avg_read_length = (stats->nreads_1st +
+ stats->nreads_2nd +
+ stats->nreads_other)
+ ? (float)stats->total_len / (stats->nreads_1st +
+ stats->nreads_2nd +
+ stats->nreads_other)
+ : 0;
fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length);
fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0);
fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? (float)stats->total_len_2nd/stats->nreads_2nd:0);
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.17
+VERSION=1.18
# If we have a git clone, then check against the current tag
if [ -e .git ]
#! /usr/bin/python
-'''pysam - a python module for reading, manipulating and writing
+'''pysam --- a Python package for reading, manipulating, and writing
genomic data sets.
-pysam is a lightweight wrapper of the htslib C-API and provides
-facilities to read and write SAM/BAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ
-files as well as access to the command line functionality of the
-samtools and bcftools packages. The module supports compression and
-random access through indexing.
-
-This module provides a low-level wrapper around the htslib C-API as
-using cython and a high-level API for convenient access to the data
-within standard genomic file formats.
-
-See:
-http://www.htslib.org
-https://github.com/pysam-developers/pysam
-http://pysam.readthedocs.org/en/stable
+pysam is a lightweight wrapper of the HTSlib API and provides facilities
+to read and write SAM/BAM/CRAM/VCF/BCF/BED/GFF/GTF/FASTA/FASTQ files
+as well as access to the command-line functionality of samtools and bcftools.
+The module supports compression and random access through indexing.
+This module provides a low-level wrapper around HTSlib's C API using Cython
+and a high-level API for convenient access to the data within standard genomic
+file formats.
'''
import collections
import glob
+import logging
import os
import platform
import re
import sys
import sysconfig
from contextlib import contextmanager
-from distutils import log
from setuptools import setup, Command
-from distutils.command.build import build
from setuptools.command.sdist import sdist
-from distutils.errors import LinkError
+from setuptools.extension import Extension
+
+try:
+ from setuptools.errors import LinkError
+except ImportError:
+ from distutils.errors import LinkError
+
+try:
+ from Cython.Distutils import build_ext
+except ImportError:
+ from setuptools.command.build_ext import build_ext
-from cy_build import CyExtension as Extension, cy_build_ext as build_ext
try:
import cython # noqa
HAVE_CYTHON = True
IS_PYTHON3 = sys.version_info.major >= 3
IS_DARWIN = platform.system() == 'Darwin'
+log = logging.getLogger('pysam')
+
@contextmanager
def changedir(path):
sdist.run(self)
-# Override build command to add extra build steps.
-class extra_build(build):
+# Override Cythonised build_ext command to customise macOS shared libraries.
+
+# Extension subclass that carries two optional hooks, passed as the keyword
+# arguments "init_func" and "prebuild_func". Both are invoked later by
+# cy_build_ext.build_extension().
+class CyExtension(Extension):
+ def __init__(self, *args, **kwargs):
+ # Pop the hooks (default None) so they are not forwarded to
+ # Extension.__init__ along with the remaining keyword arguments.
+ self._init_func = kwargs.pop("init_func", None)
+ self._prebuild_func = kwargs.pop("prebuild_func", None)
+ Extension.__init__(self, *args, **kwargs)
+
+ # Append the given entries to this extension's include_dirs.
+ def extend_includes(self, includes):
+ self.include_dirs.extend(includes)
+
+ # Append the given entries to this extension's define_macros.
+ def extend_macros(self, macros):
+ self.define_macros.extend(macros)
+
+ # Append the given entries to this extension's extra_objects.
+ def extend_extra_objects(self, objs):
+ self.extra_objects.extend(objs)
+
+
+class cy_build_ext(build_ext):
def check_ext_symbol_conflicts(self):
"""Checks for symbols defined in multiple extension modules,
which can lead to crashes due to incorrect functions being invoked.
Avoid by adding an appropriate #define to import/pysam.h or in
unusual cases adding another rewrite rule to devtools/import.py.
"""
- build_ext_obj = self.distribution.get_command_obj('build_ext')
-
symbols = dict()
for ext in self.distribution.ext_modules:
- for sym in run_nm_defined_symbols(build_ext_obj.get_ext_fullpath(ext.name)):
+ for sym in run_nm_defined_symbols(self.get_ext_fullpath(ext.name)):
symbols.setdefault(sym, []).append(ext.name.lstrip('pysam.'))
errors = 0
if errors > 0: raise LinkError("symbols defined in multiple extensions")
def run(self):
- build.run(self)
+ # On macOS, strip '-bundle' from the LDSHARED command (read from the
+ # environment, falling back to sysconfig) and export the result before
+ # the extensions are built.
+ if sys.platform == 'darwin':
+ ldshared = os.environ.get('LDSHARED', sysconfig.get_config_var('LDSHARED'))
+ os.environ['LDSHARED'] = ldshared.replace('-bundle', '')
+
+ build_ext.run(self)
+ # After the build, check for symbols defined in more than one extension
+ # unless HTSLIB_MODE is 'separate'. If nm cannot be invoked the check is
+ # skipped with a warning instead of propagating the exception.
try:
if HTSLIB_MODE != 'separate':
self.check_ext_symbol_conflicts()
except OSError as e:
- log.warn("skipping symbol collision check (invoking nm failed: %s)", e)
+ log.warning("skipping symbol collision check (invoking nm failed: %s)", e)
except subprocess.CalledProcessError:
- log.warn("skipping symbol collision check (invoking nm failed)")
+ log.warning("skipping symbol collision check (invoking nm failed)")
+
+ # Build a single extension: run its optional init hook, add library
+ # search and link settings, run its optional prebuild hook, then delegate
+ # to build_ext.build_extension().
+ def build_extension(self, ext):
+
+ # Only CyExtension instances carry the _init_func hook.
+ if isinstance(ext, CyExtension) and ext._init_func:
+ ext._init_func(ext)
+
+ # When not building in place, also search the "pysam" directory under
+ # build_lib for libraries.
+ if not self.inplace:
+ ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
+
+ if sys.platform == 'darwin':
+ # The idea is to give shared libraries an install name of the form
+ # `@rpath/<library-name.so>`, and to set the rpath equal to
+ # @loader_path. This will allow Python packages to find the library
+ # in the expected place, while still giving enough flexibility to
+ # external applications to link against the library.
+ relative_module_path = ext.name.replace(".", os.sep) + (sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO'))
+ library_path = os.path.join(
+ "@rpath", os.path.basename(relative_module_path)
+ )
+
+ # Normalise a falsy extra_link_args to a list before appending.
+ if not ext.extra_link_args:
+ ext.extra_link_args = []
+ ext.extra_link_args += ['-dynamiclib',
+ '-rpath', '@loader_path',
+ '-Wl,-headerpad_max_install_names',
+ '-Wl,-install_name,%s' % library_path,
+ '-Wl,-x']
+ else:
+ # Other platforms: pass an rpath of $ORIGIN to the linker.
+ if not ext.extra_link_args:
+ ext.extra_link_args = []
+
+ ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
+
+ # The prebuild hook receives the extension and this command's force flag.
+ if isinstance(ext, CyExtension) and ext._prebuild_func:
+ ext._prebuild_func(ext, self.force)
+
+ build_ext.build_extension(self, ext)
class clean_ext(Command):
for line in inf:
if line.startswith("#define"):
key, value = re.match(
- "#define (\S+)\s+(\S+)", line).groups()
+ r"#define (\S+)\s+(\S+)", line).groups()
config_values[key] = value
for key in ["ENABLE_GCS",
"ENABLE_PLUGINS",
args = " ".join(ext.extra_compile_args)
run_make(["ALL_CPPFLAGS=-I. " + args + " $(CPPFLAGS)", "lib-static"])
else:
- log.warn("skipping 'libhts.a' (already built)")
+ log.warning("skipping 'libhts.a' (already built)")
def prebuild_libcsamtools(ext, force):
metadata = {
'name': "pysam",
'version': get_pysam_version(),
- 'description': "pysam",
+ 'description': "Package for reading, manipulating, and writing genomic data",
'long_description': __doc__,
+ 'long_description_content_type': "text/x-rst",
'author': "Andreas Heger",
'author_email': "andreas.heger@gmail.com",
'license': "MIT",
'classifiers': [_f for _f in classifiers.split("\n") if _f],
'url': "https://github.com/pysam-developers/pysam",
'packages': package_list,
- 'requires': ['cython (>=0.29.12)'],
- 'ext_modules': [Extension(**opts) for opts in modules],
- 'cmdclass': {'build': extra_build, 'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist},
+ 'ext_modules': [CyExtension(**opts) for opts in modules],
+ 'cmdclass': {'build_ext': cy_build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist},
'package_dir': package_dirs,
'package_data': {'': ['*.pxd', '*.h', 'py.typed', '*.pyi'], },
# do not pack in order to permit linking to csamtools.so
],
)
+ # An MD tag stored as the single character "7" (value_type="A") on a 7M
+ # read of all "A" bases: get_aligned_pairs(with_seq=True) is expected to
+ # return the seven tuples (0..6, 20..26, "A").
+ def test_get_aligned_pairs_1character_md(self):
+ a = self.build_read()
+ a.query_sequence = "A" * 7
+ a.cigarstring = "7M"
+ a.set_tag("MD", "7", value_type="A")
+ self.assertEqual(
+ a.get_aligned_pairs(with_seq=True),
+ [
+ (0, 20, "A"),
+ (1, 21, "A"),
+ (2, 22, "A"),
+ (3, 23, "A"),
+ (4, 24, "A"),
+ (5, 25, "A"),
+ (6, 26, "A"),
+ ],
+ )
+
+ # An MD tag set from the integer 7 rather than a string:
+ # get_aligned_pairs(with_seq=True) is expected to raise TypeError.
+ def test_get_aligned_pairs_bad_type_md(self):
+ a = self.build_read()
+ a.query_sequence = "A" * 7
+ a.cigarstring = "7M"
+ a.set_tag("MD", 7)
+ with self.assertRaises(TypeError):
+ a.get_aligned_pairs(with_seq=True)
+
def testNoSequence(self):
"""issue 176: retrieving length without query sequence
with soft-clipping.
+# cython: language_level=3
+
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
from pysam.libctabix cimport Tabixfile
+# cython: language_level=3
+
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
from pysam.libcalignedsegment cimport pysam_get_flag
+++ /dev/null
-# content of: tox.ini , put in same dir as setup.py
-[tox]
-envlist = py36 py311
-
-[testenv]
-deps = pytest # install pytest in the virtualenv where commands will be executed
-commands =
- pytest tests