From 95d45b0b9d397716d8b779eede3b7d690c7d8823 Mon Sep 17 00:00:00 2001 From: Andreas Tille Date: Fri, 30 Sep 2022 12:00:06 +0200 Subject: [PATCH] New upstream version 0.19.1+ds --- NEWS | 124 +- README.rst | 2 +- bcftools/abuf.c | 21 +- bcftools/abuf.c.pysam.c | 21 +- bcftools/bam_sample.c | 6 +- bcftools/bam_sample.c.pysam.c | 6 +- bcftools/bcftools.h | 9 +- bcftools/bcftools.pysam.c | 26 +- bcftools/bcftools.pysam.h | 20 +- bcftools/bin.c | 3 +- bcftools/bin.c.pysam.c | 3 +- bcftools/consensus.c | 13 +- bcftools/consensus.c.pysam.c | 13 +- bcftools/csq.c | 82 +- bcftools/csq.c.pysam.c | 82 +- bcftools/dbuf.h | 71 ++ bcftools/filter.c | 30 +- bcftools/filter.c.pysam.c | 30 +- bcftools/main.c | 16 +- bcftools/main.c.pysam.c | 16 +- bcftools/mpileup.c | 66 +- bcftools/mpileup.c.pysam.c | 66 +- bcftools/regidx.h | 22 + bcftools/reheader.c | 24 +- bcftools/reheader.c.pysam.c | 24 +- bcftools/smpl_ilist.c | 34 +- bcftools/smpl_ilist.c.pysam.c | 34 +- bcftools/smpl_ilist.h | 3 +- bcftools/tsv2vcf.c | 15 +- bcftools/tsv2vcf.c.pysam.c | 15 +- bcftools/tsv2vcf.h | 3 +- bcftools/vcfannotate.c | 300 +++-- bcftools/vcfannotate.c.pysam.c | 300 +++-- bcftools/vcfcall.c | 6 +- bcftools/vcfcall.c.pysam.c | 6 +- bcftools/vcfcnv.c | 16 +- bcftools/vcfcnv.c.pysam.c | 16 +- bcftools/vcfconcat.c | 6 +- bcftools/vcfconcat.c.pysam.c | 6 +- bcftools/vcfconvert.c | 209 +++- bcftools/vcfconvert.c.pysam.c | 209 +++- bcftools/vcffilter.c | 126 +- bcftools/vcffilter.c.pysam.c | 126 +- bcftools/vcfgtcheck.c | 12 +- bcftools/vcfgtcheck.c.pysam.c | 12 +- bcftools/vcfhead.c | 133 ++ bcftools/vcfhead.c.pysam.c | 135 ++ bcftools/vcfisec.c | 15 +- bcftools/vcfisec.c.pysam.c | 15 +- bcftools/vcfmerge.c | 63 +- bcftools/vcfmerge.c.pysam.c | 63 +- bcftools/vcfnorm.c | 31 +- bcftools/vcfnorm.c.pysam.c | 31 +- bcftools/vcfplugin.c | 32 +- bcftools/vcfplugin.c.pysam.c | 32 +- bcftools/vcfquery.c | 77 +- bcftools/vcfquery.c.pysam.c | 77 +- bcftools/vcfroh.c | 82 +- bcftools/vcfroh.c.pysam.c | 82 +- 
bcftools/vcfstats.c | 56 +- bcftools/vcfstats.c.pysam.c | 56 +- bcftools/vcfview.c | 12 +- bcftools/vcfview.c.pysam.c | 12 +- bcftools/version.c | 11 +- bcftools/version.c.pysam.c | 11 +- bcftools/version.sh | 2 +- cy_build.py | 2 +- devtools/import.py | 10 +- doc/api.rst | 11 +- doc/conf.py | 9 +- doc/developer.rst | 23 +- doc/faq.rst | 75 +- doc/glossary.rst | 54 +- doc/index.rst | 6 +- doc/release.rst | 29 +- doc/usage.rst | 50 +- import/pysam.c | 26 +- import/pysam.h | 20 +- pysam/Pileup.py | 8 +- pysam/__init__.py | 30 +- pysam/bcftools.py | 1 + pysam/libcalignedsegment.pxd | 3 +- pysam/libcalignedsegment.pyi | 216 ++++ pysam/libcalignedsegment.pyx | 284 +++-- pysam/libcalignmentfile.pyi | 237 ++++ pysam/libcalignmentfile.pyx | 28 +- pysam/libcbcf.pxd | 2 + pysam/libcbcf.pyi | 369 ++++++ pysam/libcbcf.pyx | 69 +- pysam/libcbcftools.pxd | 1 - pysam/libcbcftools.pyi | 1 + pysam/libcbgzf.pyi | 40 + pysam/libcbgzf.pyx | 2 +- pysam/libcfaidx.pyi | 68 + pysam/libchtslib.pxd | 82 ++ pysam/libchtslib.pyi | 115 ++ pysam/libchtslib.pyx | 2 +- pysam/libcsamfile.pyi | 5 + pysam/libcsamtools.pxd | 1 - pysam/libcsamtools.pyi | 1 + pysam/libctabix.pyi | 103 ++ pysam/libctabixproxies.pyi | 62 + pysam/libcutils.pyi | 28 + pysam/libcutils.pyx | 14 +- pysam/libcvcf.pyi | 0 pysam/py.typed | 0 pysam/samtools.py | 2 + pysam/version.h | 6 +- pysam/version.py | 8 +- pysam/version.pyi | 4 + samtools/LICENSE | 2 +- samtools/README | 10 +- samtools/bam2bcf.c | 821 ------------ samtools/bam2bcf.c.pysam.c | 823 ------------- samtools/bam2bcf.h | 140 --- samtools/bam2bcf_indel.c | 547 -------- samtools/bam2bcf_indel.c.pysam.c | 549 --------- samtools/bam2depth.c | 10 +- samtools/bam2depth.c.pysam.c | 10 +- samtools/bam_ampliconclip.c | 5 +- samtools/bam_ampliconclip.c.pysam.c | 5 +- samtools/bam_consensus.c | 1712 +++++++++++++++++++++++++ samtools/bam_consensus.c.pysam.c | 1714 ++++++++++++++++++++++++++ samtools/bam_lpileup.c | 6 +- samtools/bam_lpileup.c.pysam.c | 6 +- 
samtools/bam_markdup.c | 225 +++- samtools/bam_markdup.c.pysam.c | 225 +++- samtools/bam_plbuf.h | 17 +- samtools/bam_plcmd.c | 746 ++++------- samtools/bam_plcmd.c.pysam.c | 746 ++++------- samtools/bam_reheader.c.pysam.c | 2 +- samtools/bamtk.c | 13 +- samtools/bamtk.c.pysam.c | 17 +- samtools/bedcov.c | 4 +- samtools/bedcov.c.pysam.c | 4 +- samtools/consensus_pileup.c | 595 +++++++++ samtools/consensus_pileup.c.pysam.c | 597 +++++++++ samtools/consensus_pileup.h | 79 ++ samtools/phase.c | 5 +- samtools/phase.c.pysam.c | 5 +- samtools/sam_view.c | 888 +++++++++---- samtools/sam_view.c.pysam.c | 888 +++++++++---- samtools/samtools.pysam.c | 26 +- samtools/samtools.pysam.h | 20 +- samtools/splaysort.h | 200 +++ samtools/stats.c | 5 +- samtools/stats.c.pysam.c | 5 +- samtools/version.sh | 2 +- setup.cfg | 37 +- setup.py | 81 +- tests/AlignedSegment_test.py | 1282 ++++++++++++------- tests/AlignmentFileFetchTestUtils.py | 5 +- tests/AlignmentFileFetch_bench.py | 3 +- tests/AlignmentFileHeader_test.py | 6 - tests/AlignmentFilePileup_bench.py | 4 +- tests/PileupTestUtils.py | 3 +- tests/TestUtils.py | 7 +- tests/VariantFileFetchTestUtils.py | 2 - tests/VariantFile_bench.py | 2 - tests/VariantFile_test.py | 26 +- tests/VariantRecord_test.py | 12 +- tests/faidx_test.py | 1 - tests/pysam_data/MM-chebi.sam | 2 + tests/pysam_data/MM-double.sam | 3 + tests/pysam_data/MM-multi.sam | 7 + tests/pysam_data/MM-orient.sam | 6 + tests/tabixproxies_test.py | 1 - tests/test_samtools_python.py | 1 - 168 files changed, 12079 insertions(+), 6471 deletions(-) create mode 100644 bcftools/dbuf.h create mode 100644 bcftools/vcfhead.c create mode 100644 bcftools/vcfhead.c.pysam.c create mode 100644 pysam/libcalignedsegment.pyi create mode 100644 pysam/libcalignmentfile.pyi create mode 100644 pysam/libcbcf.pyi create mode 100644 pysam/libcbcftools.pyi create mode 100644 pysam/libcbgzf.pyi create mode 100644 pysam/libcfaidx.pyi create mode 100644 pysam/libchtslib.pyi create mode 100644 
pysam/libcsamfile.pyi create mode 100644 pysam/libcsamtools.pyi create mode 100644 pysam/libctabix.pyi create mode 100644 pysam/libctabixproxies.pyi create mode 100644 pysam/libcutils.pyi create mode 100644 pysam/libcvcf.pyi create mode 100644 pysam/py.typed create mode 100644 pysam/version.pyi delete mode 100644 samtools/bam2bcf.c delete mode 100644 samtools/bam2bcf.c.pysam.c delete mode 100644 samtools/bam2bcf.h delete mode 100644 samtools/bam2bcf_indel.c delete mode 100644 samtools/bam2bcf_indel.c.pysam.c create mode 100644 samtools/bam_consensus.c create mode 100644 samtools/bam_consensus.c.pysam.c create mode 100644 samtools/consensus_pileup.c create mode 100644 samtools/consensus_pileup.c.pysam.c create mode 100644 samtools/consensus_pileup.h create mode 100644 samtools/splaysort.h create mode 100644 tests/pysam_data/MM-chebi.sam create mode 100644 tests/pysam_data/MM-double.sam create mode 100644 tests/pysam_data/MM-multi.sam create mode 100644 tests/pysam_data/MM-orient.sam diff --git a/NEWS b/NEWS index 75d9249..3af63dd 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,128 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.19.1 +============== + +This release wraps htslib/samtools/bcftools version 1.15.1. + +* [#1104] add an add_samples() method to quickly add multiple samples + to VCF. + +Release 0.19.0 +============== + +This release wraps htslib/samtools/bcftools version 1.15. + +* [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands + +* [#1078] Support BAM_CPAD in get_aligned_pairs + +* [#1063] Run flake8 and fix some linting issues + +* [#1088] Add AlignedSegment is_mapped/mate_is_mapped/is_forward/mate_is_forward properties + +* Write an absent AlignedSegment.qual as all-bytes-0xff + +* Fix BGZFile.read() behaviour near or at EOF + +* First API for the htslib modified bases interface + +Release 0.18.0 +============== + +This release wraps htslib/samtools/bcftools version 1.14. 
+ +* [#1048] and [#1060], clarify documentation of index statistics with CRAM files +* Prevent "retval may be used uninitialised" warning. +* Add new "samples" subcommand to pysam/samtools.py +* Introduce TupleProxyIterator iterator object class + +Release 0.17.0 +============== + +This release wraps htslib/samtools/bcftools version 1.13. Corresponding +to new samtools commands, `pysam.samtools` now has additional functions +`ampliconclip`, `ampliconstats`, `fqimport`, and `version`. + +Bugs fixed: + +* [#447] The maximum QNAME length is fully restored to 254 +* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors +* [#603] count_coverage: ignore reads that have no SEQ field +* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault +* [#983] Add win32/\*.[ch] to MANIFEST.in +* [#994] Raise exception in ``get_tid()`` if header could not be parsed +* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi +* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases +* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax +* [#1035] Improved handling of file iteration errors +* [#1038] ``tabix_index()`` no longer leaks file descriptors +* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value + (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based) +* *setup.py* longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0 + +New facilities: + +* [PR #963] Additional VCF classes are exposed to pysam programmers +* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion +* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize +* Running tests with ``pytest`` now automatically runs ``make`` to generate test data + +Documentation improvements: + +* [#726] Clarify get_forward_sequence/get_forward_qualities documentation +* [#865] Improved example 
+* [#968] ``get_index_statstics`` parameters +* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open. +* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation +* [#999] Fix documentation for ``AlignmentFile.get_reference_length()`` +* [#1002] Document the default min_base_quality for ``pileup()`` + + +Release 0.16.0 +============== + +This release wraps htslib/bcftools version 1.10.2 and samtools version +1.10. The following bugs reported against pysam are fixed due to this: + +* [#447] Writing out QNAME longer than 251 characters corrupts BAM +* [#640, #734, #843] Setting VariantRecord pos or stop raises error +* [#738, #919] FastxFile truncates concatenated plain gzip compressed files + +Additional bugfixes: + +* [#840] Pileup doesn't work on python3 when `index_filename` is used +* [#886] FastqProxy raises ValueError when instantiated from python +* [#904] VariantFile.fetch() throws ValueError on files with no records +* [#909] Fix incorrect quoting in VariantFile contig records +* [#915, #916] Implement pileup() for unindexed files and/or SAM files + +Backwards incompatible changes: + +* The `samtools import` command was removed in samtools 1.10, so pysam + no longer exports a `samimport` function. Use `pysam.view()` instead. + + +Release 0.15.4 +============== + +Bugfix release. Principal reason for release is to update cython +version in order to fix pip install pysam with python 3.8. 
+ +* [#879] Fix add_meta function in libcbcf.pyx, so meta-information + lines in header added with this function have double-quoting rules + in accordance to rules specified in VCF4.2 and VCF4.3 specifications +* [#863] Force arg to bytes to support non-ASCII encoding +* [#875] Bump minimum Cython version +* [#868] Prevent segfault on Python 2.7 AlignedSegment.compare(other=None) +* [#867] Fix wheel building on TravisCI +* [#863] Force arg to bytes to support non-ASCII encoding +* [#799] disambiguate interpretation of bcf_read return code +* [#841] Fix silent truncation of FASTQ with bad q strings +* [#846] Prevent segmentation fault on ID, when handling malformed records +* [#829] Run configure with the correct CC/CFLAGS/LDFLAGS env vars + Release 0.15.3 ============== @@ -301,7 +423,7 @@ changes, for example:: will become:: - cimport pysam.libcamtools + cimport pysam.libcsamtools Release 0.9.1 ============= diff --git a/README.rst b/README.rst index 547868b..06d44bf 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.14, samtools-1.14, and bcftools-1.14. +The current version of pysam wraps 3rd-party code from htslib-1.15.1, samtools-1.15.1, and bcftools-1.15.1. Pysam is available through `pypi `_. To install, type:: diff --git a/bcftools/abuf.c b/bcftools/abuf.c index a97332a..78682d6 100644 --- a/bcftools/abuf.c +++ b/bcftools/abuf.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2021 Genome Research Ltd. + Copyright (c) 2021-2022 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -117,7 +117,7 @@ void abuf_set(abuf_t *buf, abuf_opt_t key, void *value) if ( key==INFO_TAG ) { buf->split.info_tag = *((char**)value); - bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); + bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); return; } if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; } @@ -141,7 +141,7 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--; int Mlen = rlen > alen ? 
rlen : alen; - atom_t *atom = NULL; + atom_t *atom = NULL; int i; for (i=0; itmp2+num_size,missing_ptr,num_size); else memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size); - if ( type==BCF_HT_INT && mode==M_SUM ) + if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; for (i=iori; isplit.nori; i++) @@ -466,7 +466,10 @@ static void _split_table_set_gt(abuf_t *buf) error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1); int ial = _split_table_get_ial(buf,iout,iori); if ( ial==2 && !star_allele ) + { dst[j] = bcf_gt_missing; + if ( bcf_gt_is_phased(src[j]) ) dst[j] |= 1; + } else dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial); } @@ -542,7 +545,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo { int star_allele = _has_star_allele(buf,iout); bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; - int ret = 0; + int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); else if ( len==BCF_VL_A && type!=BCF_HT_STR ) @@ -707,7 +710,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) buf->vcf[j] = bcf_dup(rec); return; } - for (i=1; in_allele; i++) + for (i=0; in_allele; i++) { if ( _is_acgtn(rec->d.allele[i]) ) continue; rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); diff --git a/bcftools/abuf.c.pysam.c b/bcftools/abuf.c.pysam.c index a727836..d85a54c 100644 --- a/bcftools/abuf.c.pysam.c +++ b/bcftools/abuf.c.pysam.c @@ -2,20 +2,20 @@ /* The MIT License - Copyright (c) 2021 Genome Research Ltd. + Copyright (c) 2021-2022 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -119,7 +119,7 @@ void abuf_set(abuf_t *buf, abuf_opt_t key, void *value) if ( key==INFO_TAG ) { buf->split.info_tag = *((char**)value); - bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); + bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); return; } if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; } @@ -143,7 +143,7 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--; int Mlen = rlen > alen ? 
rlen : alen; - atom_t *atom = NULL; + atom_t *atom = NULL; int i; for (i=0; itmp2+num_size,missing_ptr,num_size); else memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size); - if ( type==BCF_HT_INT && mode==M_SUM ) + if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; for (i=iori; isplit.nori; i++) @@ -468,7 +468,10 @@ static void _split_table_set_gt(abuf_t *buf) error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1); int ial = _split_table_get_ial(buf,iout,iori); if ( ial==2 && !star_allele ) + { dst[j] = bcf_gt_missing; + if ( bcf_gt_is_phased(src[j]) ) dst[j] |= 1; + } else dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial); } @@ -544,7 +547,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo { int star_allele = _has_star_allele(buf,iout); bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; - int ret = 0; + int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); else if ( len==BCF_VL_A && type!=BCF_HT_STR ) @@ -709,7 +712,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) buf->vcf[j] = bcf_dup(rec); return; } - for (i=1; in_allele; i++) + for (i=0; in_allele; i++) { if ( _is_acgtn(rec->d.allele[i]) ) continue; rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); diff --git a/bcftools/bam_sample.c b/bcftools/bam_sample.c index a6da943..d8c10b8 100644 --- a/bcftools/bam_sample.c +++ b/bcftools/bam_sample.c @@ -1,7 +1,7 @@ /* bam_sample.c -- group data by sample. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2013, 2016-2018 Genome Research Ltd. + Copyright (C) 2013, 2016-2022 Genome Research Ltd. 
Author: Heng Li , Petr Danecek @@ -281,7 +281,7 @@ int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file) int i, nsamples = 0; char **samples = hts_readlist(list, is_file, &nsamples); - if ( !nsamples ) return 0; + if ( !samples || !nsamples ) return 0; kstring_t ori = {0,0,0}; kstring_t ren = {0,0,0}; @@ -328,7 +328,7 @@ int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) int i, nrows = 0; char **rows = hts_readlist(list, is_file, &nrows); - if ( !nrows ) return 0; + if ( !rows || !nrows ) return 0; kstring_t fld1 = {0,0,0}; kstring_t fld2 = {0,0,0}; diff --git a/bcftools/bam_sample.c.pysam.c b/bcftools/bam_sample.c.pysam.c index 565cfc1..9384470 100644 --- a/bcftools/bam_sample.c.pysam.c +++ b/bcftools/bam_sample.c.pysam.c @@ -3,7 +3,7 @@ /* bam_sample.c -- group data by sample. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2013, 2016-2018 Genome Research Ltd. + Copyright (C) 2013, 2016-2022 Genome Research Ltd. Author: Heng Li , Petr Danecek @@ -283,7 +283,7 @@ int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file) int i, nsamples = 0; char **samples = hts_readlist(list, is_file, &nsamples); - if ( !nsamples ) return 0; + if ( !samples || !nsamples ) return 0; kstring_t ori = {0,0,0}; kstring_t ren = {0,0,0}; @@ -330,7 +330,7 @@ int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) int i, nrows = 0; char **rows = hts_readlist(list, is_file, &nrows); - if ( !nrows ) return 0; + if ( !rows || !nrows ) return 0; kstring_t fld1 = {0,0,0}; kstring_t fld2 = {0,0,0}; diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index b188e98..a915802 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -28,6 +28,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include #define FT_TAB_TEXT 0 // custom tab-delimited text file @@ -50,9 +51,11 @@ void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); -const char *hts_bcf_wmode2(int file_type, char *fname); -void set_wmode(char dst[8], int file_type, char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset +const char *hts_bcf_wmode2(int file_type, const char *fname); +void set_wmode(char dst[8], int file_type, const char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset char *init_tmp_prefix(const char *prefix); +int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); +int parse_overlap_option(const char *arg); void *smalloc(size_t size); // safe malloc diff --git a/bcftools/bcftools.pysam.c b/bcftools/bcftools.pysam.c index c6f4fd8..e7f620a 100644 --- a/bcftools/bcftools.pysam.c +++ b/bcftools/bcftools.pysam.c @@ -1,5 +1,4 @@ -#include -#include +#include #include #include #include @@ -62,6 +61,15 @@ static int bcftools_status = 0; int bcftools_dispatch(int argc, char *argv[]) { + /* Reset getopt()/getopt_long() processing. */ +#if defined __GLIBC__ + optind = 0; +#elif defined _OPTRESET || defined _OPTRESET_DECLARED + optreset = optind = 1; +#else + optind = 1; +#endif + if (setjmp(bcftools_jmpbuf) == 0) return bcftools_main(argc, argv); else @@ -73,17 +81,3 @@ void bcftools_exit(int status) bcftools_status = status; longjmp(bcftools_jmpbuf, 1); } - - -void bcftools_set_optind(int val) -{ - // setting this in cython via - // "from posix.unistd cimport optind" - // did not work. 
- // - // setting to 0 forces a complete re-initialization - optind = val; -} - - - diff --git a/bcftools/bcftools.pysam.h b/bcftools/bcftools.pysam.h index b8bf93e..e6717bb 100644 --- a/bcftools/bcftools.pysam.h +++ b/bcftools/bcftools.pysam.h @@ -53,8 +53,22 @@ int bcftools_dispatch(int argc, char *argv[]); void PYSAM_NORETURN bcftools_exit(int status); -void bcftools_set_optind(int); - extern int bcftools_main(int argc, char *argv[]); - + +/* Define these only in samtools/bcftools C source, not Cython code. */ +#if !(defined CYTHON_ABI || defined CYTHON_HEX_VERSION) + +/*! Several non-static function names are used in both samtools and bcftools. + Both libcsamtools.so and libcbcftools.so are loaded simultaneously, leading + to collisions and wrong functions being called. #define these names so the + actual symbol names include distinct prefixes to avoid collisions. + */ +#define main_consensus bcftools_main_consensus +#define main_reheader bcftools_main_reheader +#define bam_smpl_init bcftools_bam_smpl_init +#define bam_smpl_destroy bcftools_bam_smpl_destroy +#define read_file_list bcftools_read_file_list + +#endif + #endif diff --git a/bcftools/bin.c b/bcftools/bin.c index a4817cf..645012e 100644 --- a/bcftools/bin.c +++ b/bcftools/bin.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016 Genome Research Ltd. + Copyright (c) 2016-2022 Genome Research Ltd. Author: Petr Danecek @@ -43,6 +43,7 @@ bin_t *bin_init(const char *list_def, float min, float max) int is_file = strchr(list_def,',') ? 0 : 1; int i, nlist; char **list = hts_readlist(list_def, is_file, &nlist); + if ( !list ) error("Error: failed to read %s\n",list_def); bin->nbins = nlist; bin->bins = (float*) malloc(sizeof(float)*nlist); for (i=0; i @@ -45,6 +45,7 @@ bin_t *bin_init(const char *list_def, float min, float max) int is_file = strchr(list_def,',') ? 
0 : 1; int i, nlist; char **list = hts_readlist(list_def, is_file, &nlist); + if ( !list ) error("Error: failed to read %s\n",list_def); bin->nbins = nlist; bin->bins = (float*) malloc(sizeof(float)*nlist); for (i=0; irlen > args->fa_buf.l - idx ) { rec->rlen = args->fa_buf.l - idx; - alen = strlen(alt_allele); - if ( alen > rec->rlen ) + if ( alt_allele[0]!='<' ) { - alt_allele[rec->rlen] = 0; - fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + alen = strlen(alt_allele); + if ( alen > rec->rlen ) + { + fprintf(stderr,"Warning: trimming variant \"%s\" starting at %s:%"PRId64"\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + alt_allele[rec->rlen] = 0; + } } } if ( idx>=args->fa_buf.l ) @@ -749,7 +752,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") ) - error("Symbolic alleles other than , <*> or are currently not supported, e.g. %s at %s:%"PRId64".\n" + error("Symbolic alleles other than , <*> or are currently not supported, e.g. 
\"%s\" at %s:%"PRId64".\n" "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); if ( !strcasecmp(alt_allele,"") ) diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index fa5c14b..9c50091 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -735,11 +735,14 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( rec->rlen > args->fa_buf.l - idx ) { rec->rlen = args->fa_buf.l - idx; - alen = strlen(alt_allele); - if ( alen > rec->rlen ) + if ( alt_allele[0]!='<' ) { - alt_allele[rec->rlen] = 0; - fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + alen = strlen(alt_allele); + if ( alen > rec->rlen ) + { + fprintf(bcftools_stderr,"Warning: trimming variant \"%s\" starting at %s:%"PRId64"\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + alt_allele[rec->rlen] = 0; + } } } if ( idx>=args->fa_buf.l ) @@ -751,7 +754,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") ) - error("Symbolic alleles other than , <*> or are currently not supported, e.g. %s at %s:%"PRId64".\n" + error("Symbolic alleles other than , <*> or are currently not supported, e.g. 
\"%s\" at %s:%"PRId64".\n" "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); if ( !strcasecmp(alt_allele,"") ) diff --git a/bcftools/csq.c b/bcftools/csq.c index 6217987..de0d7a9 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -331,6 +331,7 @@ const char *csq_strings[] = #define GF_UTR5 ((1<<(GF_coding_bit+1))+4) // GF_MAX = (1<<30)-1, see hap_node_t +#define CDS_PHASE_UNKN 3 typedef struct _tscript_t tscript_t; typedef struct { @@ -340,7 +341,7 @@ typedef struct // update hap_node_t.sbeg in hap_init, could be calculated on the fly) uint32_t len; // exon length uint32_t icds:30, // exon index within the transcript - phase:2; // offset of the CDS + phase:2; // offset of the CDS: 0,1,2 or 3 for unknown } gf_cds_t; typedef struct @@ -517,7 +518,7 @@ typedef struct uint32_t end; uint32_t trid; uint32_t strand:1; // STRAND_REV,STRAND_FWD - uint32_t phase:2; // 0, 1 or 2 + uint32_t phase:2; // 0, 1, 2, or 3 for unknown uint32_t iseq:29; } ftr_t; @@ -1051,7 +1052,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) if ( *ss == '0' ) ftr->phase = 0; else if ( *ss == '1' ) ftr->phase = 1; else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase + else if ( *ss == '.' 
) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } ss += 2; @@ -1132,6 +1133,7 @@ void tscript_init_cds(args_t *args) // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) khint_t k; + int warn_phase_unkn = 0; for (k=0; kid2tr); k++) { if ( !kh_exist(aux->id2tr, k) ) continue; @@ -1151,28 +1153,38 @@ void tscript_init_cds(args_t *args) int i, len = 0; if ( tr->strand==STRAND_FWD ) { - if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; - tr->cds[0]->beg += tr->cds[0]->phase; - tr->cds[0]->len -= tr->cds[0]->phase; - tr->cds[0]->phase = 0; + if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + } // sanity check phase; the phase number in gff tells us how many bases to skip in this // feature to reach the first base of the next codon int tscript_ok = 1; for (i=0; incds; i++) { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + warn_phase_unkn = 1; + len += tr->cds[i]->len; + continue; + } int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3) + if ( phase!=len%3 ) { if ( args->force ) { if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); tscript_ok = 0; break; } error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1180,33 +1192,43 @@ void tscript_init_cds(args_t *args) } else { - // Check that the phase is not bigger than CDS length. Curiously, this can really happen, - // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 - // todo: the same for the fwd strand - i = tr->ncds - 1; - int phase = tr->cds[i]->phase; - if ( phase ) tr->trim |= TRIM_5PRIME; - while ( i>=0 && phase > tr->cds[i]->len ) + if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) { - phase -= tr->cds[i]->len; + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + tr->cds[i]->len -= tr->cds[i]->phase; tr->cds[i]->phase = 0; - tr->cds[i]->len = 0; - i--; } - tr->cds[i]->len -= tr->cds[i]->phase; - tr->cds[i]->phase = 0; // sanity check phase int tscript_ok = 1; for (i=tr->ncds-1; i>=0; i--) { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + warn_phase_unkn = 1; + len += tr->cds[i]->len; + continue; + } int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; if ( phase!=len%3) { if ( args->force ) { if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. 
CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); tscript_ok = 0; break; } @@ -1282,6 +1304,8 @@ void tscript_init_cds(args_t *args) regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); } } + if ( warn_phase_unkn && args->verbosity > 0 ) + fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n"); } void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } @@ -4316,16 +4340,12 @@ int main_csq(int argc, char *argv[]) case 't': targets_list = optarg; break; case 'T': targets_list = optarg; targets_is_file = 1; break; case 4 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 5 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': error("%s",usage()); diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index db46c8b..e4abf4b 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -333,6 +333,7 @@ const char *csq_strings[] = #define GF_UTR5 ((1<<(GF_coding_bit+1))+4) // GF_MAX = (1<<30)-1, see hap_node_t +#define CDS_PHASE_UNKN 3 typedef struct _tscript_t tscript_t; typedef struct { @@ -342,7 +343,7 @@ typedef struct // update hap_node_t.sbeg in hap_init, could be calculated on the fly) uint32_t len; // exon 
length uint32_t icds:30, // exon index within the transcript - phase:2; // offset of the CDS + phase:2; // offset of the CDS: 0,1,2 or 3 for unknown } gf_cds_t; typedef struct @@ -519,7 +520,7 @@ typedef struct uint32_t end; uint32_t trid; uint32_t strand:1; // STRAND_REV,STRAND_FWD - uint32_t phase:2; // 0, 1 or 2 + uint32_t phase:2; // 0, 1, 2, or 3 for unknown uint32_t iseq:29; } ftr_t; @@ -1053,7 +1054,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) if ( *ss == '0' ) ftr->phase = 0; else if ( *ss == '1' ) ftr->phase = 1; else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase + else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } ss += 2; @@ -1134,6 +1135,7 @@ void tscript_init_cds(args_t *args) // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) khint_t k; + int warn_phase_unkn = 0; for (k=0; k<kh_end(aux->id2tr); k++) { if ( !kh_exist(aux->id2tr, k) ) continue; @@ -1153,28 +1155,38 @@ void tscript_init_cds(args_t *args) int i, len = 0; if ( tr->strand==STRAND_FWD ) { - if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; - tr->cds[0]->beg += tr->cds[0]->phase; - tr->cds[0]->len -= tr->cds[0]->phase; - tr->cds[0]->phase = 0; + if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + } // sanity check phase; the phase number in gff tells us how many bases to skip in this // feature to reach the first base of the next codon int tscript_ok = 1; for (i=0; i<tr->ncds; i++) { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + warn_phase_unkn = 1; + len += tr->cds[i]->len; + continue; + } int phase = tr->cds[i]->phase ? 
3 - tr->cds[i]->phase : 0; - if ( phase!=len%3) + if ( phase!=len%3 ) { if ( args->force ) { if ( args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); tscript_ok = 0; break; } error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1182,33 +1194,43 @@ void tscript_init_cds(args_t *args) } else { - // Check that the phase is not bigger than CDS length. Curiously, this can really happen, - // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 - // todo: the same for the fwd strand - i = tr->ncds - 1; - int phase = tr->cds[i]->phase; - if ( phase ) tr->trim |= TRIM_5PRIME; - while ( i>=0 && phase > tr->cds[i]->len ) + if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) { - phase -= tr->cds[i]->len; + // Check that the phase is not bigger than CDS length. 
Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + tr->cds[i]->len -= tr->cds[i]->phase; tr->cds[i]->phase = 0; - tr->cds[i]->len = 0; - i--; } - tr->cds[i]->len -= tr->cds[i]->phase; - tr->cds[i]->phase = 0; // sanity check phase int tscript_ok = 1; for (i=tr->ncds-1; i>=0; i--) { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + warn_phase_unkn = 1; + len += tr->cds[i]->len; + continue; + } int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; if ( phase!=len%3) { if ( args->force ) { if ( args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + fprintf(bcftools_stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. 
CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); tscript_ok = 0; break; } @@ -1284,6 +1306,8 @@ void tscript_init_cds(args_t *args) regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); } } + if ( warn_phase_unkn && args->verbosity > 0 ) + fprintf(bcftools_stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n"); } void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } @@ -4318,16 +4342,12 @@ int main_csq(int argc, char *argv[]) case 't': targets_list = optarg; break; case 'T': targets_list = optarg; targets_is_file = 1; break; case 4 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 5 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': error("%s",usage()); diff --git a/bcftools/dbuf.h b/bcftools/dbuf.h new file mode 100644 index 0000000..80b5958 --- /dev/null +++ b/bcftools/dbuf.h @@ -0,0 +1,71 @@ +/* The MIT License + + Copyright (c) 2022 Genome Research Ltd. 
+ + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +/* + Simple data buffer +*/ + +#ifndef __DBUF_H__ +#define __DBUF_H__ + +#include <htslib/hts.h> + +typedef struct +{ + size_t n,m; + void **dat; +} +dbuf_t; + +static inline dbuf_t *dbuf_push(dbuf_t *buf, void *ptr) +{ + if ( !buf ) buf = calloc(1,sizeof(dbuf_t)); + buf->n++; + hts_expand(void*,buf->n,buf->m,buf->dat); + buf->dat[buf->n-1] = ptr; + return buf; +} + +static inline void *dbuf_ith(dbuf_t *buf, int i) +{ + return buf->dat[i]; +} + +static inline size_t dbuf_n(dbuf_t *buf) +{ + return buf->n; +} + +static inline void dbuf_destroy_free(dbuf_t *buf) +{ + int i; + for (i=0; i<buf->n; i++) free(buf->dat[i]); + free(buf->dat); + free(buf); +} + +#endif + diff --git a/bcftools/filter.c b/bcftools/filter.c index 3c45195..7ff006e 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -1,6 +1,6 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2021 Genome Research Ltd. 
+ Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -152,6 +152,8 @@ struct _filter_t static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently +static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok); + // Return negative values if it is a function with variable number of arguments static int filters_next_token(char **str, int *len) { @@ -471,16 +473,15 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 } static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { - // multiple IDs not supported yet (easy to add though) - if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) - error("Only == and != operators are supported for ID\n"); - if ( btok->hash ) { token_t *tmp = atok; atok = btok; btok = tmp; } if ( atok->hash ) { + if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) + error("Only == and != operators are supported for strings read from a file\n"); + int ret = khash_str2int_has_key(atok->hash, line->d.id); if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1; rtok->pass_site = ret; @@ -491,8 +492,19 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * if ( rtok->tok_type==TOK_EQ ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - else + else if ( rtok->tok_type==TOK_NE ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; + else + { + if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE ) + error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n", + rtok->tok_type,atok->regex,btok->regex); + + regex_t *regex = atok->regex ? atok->regex : (btok->regex ? 
btok->regex : NULL); + if ( !regex ) error("fixme: regex initialization failed\n"); + rtok->pass_site = regexec(regex,line->d.id, 0,NULL,0) ? 0 : 1; + if ( rtok->tok_type==TOK_NLIKE ) rtok->pass_site = rtok->pass_site ? 0 : 1; + } } /** @@ -1902,7 +1914,11 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac } inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) { - token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; + token_t *tok; + if ( (atok->nsamples || btok->nsamples) && (!atok->nsamples || !btok->nsamples) ) + tok = atok->nsamples ? atok : btok; + else + tok = atok->nvalues > btok->nvalues ? atok : btok; rtok->nvalues = tok->nvalues; rtok->nval1 = tok->nval1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 8832633..3335cde 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -2,7 +2,7 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -154,6 +154,8 @@ struct _filter_t static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently +static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok); + // Return negative values if it is a function with variable number of arguments static int filters_next_token(char **str, int *len) { @@ -473,16 +475,15 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 } static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { - // multiple IDs not supported yet (easy to add though) - if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) - error("Only == and != operators are supported for ID\n"); - if ( btok->hash ) { token_t *tmp = atok; atok = btok; btok = tmp; } if ( atok->hash ) { + if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) + error("Only == and != operators are supported for strings read from a file\n"); + int ret = khash_str2int_has_key(atok->hash, line->d.id); if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1; rtok->pass_site = ret; @@ -493,8 +494,19 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * if ( rtok->tok_type==TOK_EQ ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - else + else if ( rtok->tok_type==TOK_NE ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; + else + { + if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE ) + error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n", + rtok->tok_type,atok->regex,btok->regex); + + regex_t *regex = atok->regex ? atok->regex : (btok->regex ? 
btok->regex : NULL); + if ( !regex ) error("fixme: regex initialization failed\n"); + rtok->pass_site = regexec(regex,line->d.id, 0,NULL,0) ? 0 : 1; + if ( rtok->tok_type==TOK_NLIKE ) rtok->pass_site = rtok->pass_site ? 0 : 1; + } } /** @@ -1904,7 +1916,11 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac } inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) { - token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; + token_t *tok; + if ( (atok->nsamples || btok->nsamples) && (!atok->nsamples || !btok->nsamples) ) + tok = atok->nsamples ? atok : btok; + else + tok = atok->nvalues > btok->nvalues ? atok : btok; rtok->nvalues = tok->nvalues; rtok->nval1 = tok->nval1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); diff --git a/bcftools/main.c b/bcftools/main.c index f892711..3a0d557 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -43,6 +43,7 @@ int main_vcfsom(int argc, char *argv[]); int main_vcfnorm(int argc, char *argv[]); int main_vcfgtcheck(int argc, char *argv[]); int main_vcfview(int argc, char *argv[]); +int main_vcfhead(int argc, char *argv[]); int main_vcfcall(int argc, char *argv[]); int main_vcfannotate(int argc, char *argv[]); int main_vcfroh(int argc, char *argv[]); @@ -55,6 +56,7 @@ int main_polysomy(int argc, char *argv[]); #endif #ifdef ENABLE_BCF_PLUGINS int main_plugin(int argc, char *argv[]); +int count_plugins(void); #endif int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); @@ -100,6 +102,10 @@ static cmd_t cmds[] = .alias = "convert", .help = "convert VCF/BCF files to different formats and back" }, + { .func = main_vcfhead, + .alias = "head", + .help = "view VCF/BCF file headers" + }, { .func = main_vcfisec, .alias = "isec", .help = "intersections of VCF/BCF files" @@ -225,6 +231,14 @@ static void usage(FILE *fp) if ( cmds[i].func && cmds[i].help[0]!='-' ) fprintf(fp, " %-12s %s\n", cmds[i].alias, cmds[i].help); i++; } +#if 
ENABLE_BCF_PLUGINS + fprintf(fp,"\n -- Plugins (collection of programs for calling, file manipulation & analysis)\n"); + int nplugins = count_plugins(); + if ( nplugins ) + fprintf(fp," %d plugins available, run \"bcftools plugin -lv\" to see a complete list\n", nplugins); + else + fprintf(fp," 0 plugins available, run \"bcftools plugin -l\" for help\n"); +#endif fprintf(fp,"\n"); fprintf(fp, " Most commands accept VCF, bgzipped VCF, and BCF with the file type detected\n" @@ -251,7 +265,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index bfd0f04..535813c 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -45,6 +45,7 @@ int main_vcfsom(int argc, char *argv[]); int main_vcfnorm(int argc, char *argv[]); int main_vcfgtcheck(int argc, char *argv[]); int main_vcfview(int argc, char *argv[]); +int main_vcfhead(int argc, char *argv[]); int main_vcfcall(int argc, char *argv[]); int main_vcfannotate(int argc, char *argv[]); int main_vcfroh(int argc, char *argv[]); @@ -57,6 +58,7 @@ int main_polysomy(int argc, char *argv[]); #endif #ifdef ENABLE_BCF_PLUGINS int main_plugin(int argc, char *argv[]); +int count_plugins(void); #endif int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); @@ -102,6 +104,10 @@ static cmd_t cmds[] = .alias = "convert", .help = "convert VCF/BCF files to different formats and back" }, + { .func = main_vcfhead, + .alias = "head", + .help = "view VCF/BCF file headers" + }, { .func = main_vcfisec, 
.alias = "isec", .help = "intersections of VCF/BCF files" @@ -227,6 +233,14 @@ static void usage(FILE *fp) if ( cmds[i].func && cmds[i].help[0]!='-' ) fprintf(fp, " %-12s %s\n", cmds[i].alias, cmds[i].help); i++; } +#if ENABLE_BCF_PLUGINS + fprintf(fp,"\n -- Plugins (collection of programs for calling, file manipulation & analysis)\n"); + int nplugins = count_plugins(); + if ( nplugins ) + fprintf(fp," %d plugins available, run \"bcftools plugin -lv\" to see a complete list\n", nplugins); + else + fprintf(fp," 0 plugins available, run \"bcftools plugin -l\" for help\n"); +#endif fprintf(fp,"\n"); fprintf(fp, " Most commands accept VCF, bgzipped VCF, and BCF with the file type detected\n" @@ -253,7 +267,7 @@ int bcftools_main(int argc, char *argv[]) if (argc < 2) { usage(bcftools_stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version()); + fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index eb0cc64..fd5aa51 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2021 Genome Research Ltd. + Copyright (C) 2008-2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. 
Author: Heng Li @@ -69,7 +69,7 @@ typedef struct _mplp_pileup_t mplp_pileup_t; typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, max_indel_depth, max_read_len, fmt_flag, ambig_reads; - int rflag_require, rflag_filter, output_type; + int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; @@ -197,8 +197,10 @@ static int mplp_func(void *data, bam1_t *b) // The 'B' cigar operation is not part of the specification, considering as obsolete. // bam_remove_B(b); if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads - if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue; - if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue; + if (ma->conf->rflag_skip_any_unset && (ma->conf->rflag_skip_any_unset&b->core.flag)!=ma->conf->rflag_skip_any_unset) continue; + if (ma->conf->rflag_skip_all_set && (ma->conf->rflag_skip_all_set&b->core.flag)==ma->conf->rflag_skip_all_set) continue; + if (ma->conf->rflag_skip_all_unset && !(ma->conf->rflag_skip_all_unset&b->core.flag)) continue; + if (ma->conf->rflag_skip_any_set && ma->conf->rflag_skip_any_set&b->core.flag) continue; if (ma->conf->bed) { // test overlap @@ -1087,8 +1089,10 @@ static void list_annotations(FILE *fp) static void print_usage(FILE *fp, const mplp_conf_t *mplp) { - char *tmp_require = bam_flag2str(mplp->rflag_require); - char *tmp_filter = bam_flag2str(mplp->rflag_filter); + char *tmp_skip_all_set = bam_flag2str(mplp->rflag_skip_all_set); + char *tmp_skip_any_unset = bam_flag2str(mplp->rflag_skip_any_unset); + char *tmp_skip_all_unset = bam_flag2str(mplp->rflag_skip_all_unset); + char *tmp_skip_any_set = bam_flag2str(mplp->rflag_skip_any_set); // Display usage information, formatted for the standard 80 columns. 
// (The unusual string formatting here aids the readability of this @@ -1122,10 +1126,12 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE Restrict to regions listed in a file\n" " --ignore-RG Ignore RG tags (one BAM = one sample)\n" - " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " --ls, --skip-all-set STR|INT Skip reads with all of the bits set []\n"); fprintf(fp, - " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n" - " [%s]\n", tmp_filter); + " --ns, --skip-any-set STR|INT Skip reads with any of the bits set [%s]\n", tmp_skip_any_set); + fprintf(fp, + " --lu, --skip-all-unset STR|INT Skip reads with all of the bits unset []\n" + " --nu, --skip-any-unset STR|INT Skip reads with any of the bits unset []\n"); fprintf(fp, " -s, --samples LIST Comma separated list of samples to include\n" " -S, --samples-file FILE File of samples to include\n" @@ -1184,8 +1190,10 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" "\n"); - free(tmp_require); - free(tmp_filter); + free(tmp_skip_all_set); + free(tmp_skip_any_unset); + free(tmp_skip_all_unset); + free(tmp_skip_any_set); } int main_mpileup(int argc, char *argv[]) @@ -1206,7 +1214,7 @@ int main_mpileup(int argc, char *argv[]) mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; - mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; mplp.output_type = FT_VCF; mplp.record_cmd_line = 1; @@ -1222,10 +1230,16 @@ int main_mpileup(int argc, char *argv[]) static const struct option lopts[] = { - {"rf", required_argument, NULL, 1}, 
// require flag - {"ff", required_argument, NULL, 2}, // filter flag - {"incl-flags", required_argument, NULL, 1}, - {"excl-flags", required_argument, NULL, 2}, + {"nu", required_argument, NULL, 16}, + {"lu", required_argument, NULL, 17}, + {"rf", required_argument, NULL, 17}, // old --rf, --incl-flags = --lu, --skip-all-unset + {"ns", required_argument, NULL, 18}, + {"ff", required_argument, NULL, 18}, // old --ff, --excl-flags = --ns, --skip-any-set + {"ls", required_argument, NULL, 19}, + {"skip-any-unset", required_argument, NULL, 16}, + {"skip-all-unset", required_argument, NULL, 17}, + {"skip-any-set", required_argument, NULL, 18}, + {"skip-all-set", required_argument, NULL, 19}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"ignore-RG", no_argument, NULL, 5}, @@ -1287,13 +1301,21 @@ int main_mpileup(int argc, char *argv[]) while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; - case 1 : - mplp.rflag_require = bam_str2flag(optarg); - if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } + case 16 : + mplp.rflag_skip_any_unset = bam_str2flag(optarg); + if ( mplp.rflag_skip_any_unset <0 ) { fprintf(stderr,"Could not parse --nf %s\n", optarg); return 1; } + break; + case 17 : + mplp.rflag_skip_all_unset = bam_str2flag(optarg); + if ( mplp.rflag_skip_all_unset<0 ) { fprintf(stderr,"Could not parse --if %s\n", optarg); return 1; } + break; + case 18 : + mplp.rflag_skip_any_set = bam_str2flag(optarg); + if ( mplp.rflag_skip_any_set <0 ) { fprintf(stderr,"Could not parse --ef %s\n", optarg); return 1; } break; - case 2 : - mplp.rflag_filter = bam_str2flag(optarg); - if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } + case 19 : + mplp.rflag_skip_all_set = bam_str2flag(optarg); + if ( mplp.rflag_skip_all_set <0 ) { 
fprintf(stderr,"Could not parse --df %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 7ba73f8..159e57c 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -2,7 +2,7 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2021 Genome Research Ltd. + Copyright (C) 2008-2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -71,7 +71,7 @@ typedef struct _mplp_pileup_t mplp_pileup_t; typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, max_indel_depth, max_read_len, fmt_flag, ambig_reads; - int rflag_require, rflag_filter, output_type; + int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; @@ -199,8 +199,10 @@ static int mplp_func(void *data, bam1_t *b) // The 'B' cigar operation is not part of the specification, considering as obsolete. 
// bam_remove_B(b); if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads - if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue; - if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue; + if (ma->conf->rflag_skip_any_unset && (ma->conf->rflag_skip_any_unset&b->core.flag)!=ma->conf->rflag_skip_any_unset) continue; + if (ma->conf->rflag_skip_all_set && (ma->conf->rflag_skip_all_set&b->core.flag)==ma->conf->rflag_skip_all_set) continue; + if (ma->conf->rflag_skip_all_unset && !(ma->conf->rflag_skip_all_unset&b->core.flag)) continue; + if (ma->conf->rflag_skip_any_set && ma->conf->rflag_skip_any_set&b->core.flag) continue; if (ma->conf->bed) { // test overlap @@ -1089,8 +1091,10 @@ static void list_annotations(FILE *fp) static void print_usage(FILE *fp, const mplp_conf_t *mplp) { - char *tmp_require = bam_flag2str(mplp->rflag_require); - char *tmp_filter = bam_flag2str(mplp->rflag_filter); + char *tmp_skip_all_set = bam_flag2str(mplp->rflag_skip_all_set); + char *tmp_skip_any_unset = bam_flag2str(mplp->rflag_skip_any_unset); + char *tmp_skip_all_unset = bam_flag2str(mplp->rflag_skip_all_unset); + char *tmp_skip_any_set = bam_flag2str(mplp->rflag_skip_any_set); // Display usage information, formatted for the standard 80 columns. // (The unusual string formatting here aids the readability of this @@ -1124,10 +1128,12 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -r, --regions REG[,...] 
Comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE Restrict to regions listed in a file\n" " --ignore-RG Ignore RG tags (one BAM = one sample)\n" - " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " --ls, --skip-all-set STR|INT Skip reads with all of the bits set []\n"); fprintf(fp, - " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n" - " [%s]\n", tmp_filter); + " --ns, --skip-any-set STR|INT Skip reads with any of the bits set [%s]\n", tmp_skip_any_set); + fprintf(fp, + " --lu, --skip-all-unset STR|INT Skip reads with all of the bits unset []\n" + " --nu, --skip-any-unset STR|INT Skip reads with any of the bits unset []\n"); fprintf(fp, " -s, --samples LIST Comma separated list of samples to include\n" " -S, --samples-file FILE File of samples to include\n" @@ -1186,8 +1192,10 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" "\n"); - free(tmp_require); - free(tmp_filter); + free(tmp_skip_all_set); + free(tmp_skip_any_unset); + free(tmp_skip_all_unset); + free(tmp_skip_any_set); } int main_mpileup(int argc, char *argv[]) @@ -1208,7 +1216,7 @@ int main_mpileup(int argc, char *argv[]) mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; - mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; mplp.output_type = FT_VCF; mplp.record_cmd_line = 1; @@ -1224,10 +1232,16 @@ int main_mpileup(int argc, char *argv[]) static const struct option lopts[] = { - {"rf", required_argument, NULL, 1}, // require flag - {"ff", required_argument, NULL, 2}, // filter flag - {"incl-flags", required_argument, NULL, 1}, - {"excl-flags", required_argument, NULL, 2}, + {"nu", 
required_argument, NULL, 16}, + {"lu", required_argument, NULL, 17}, + {"rf", required_argument, NULL, 17}, // old --rf, --incl-flags = --lu, --skip-all-unset + {"ns", required_argument, NULL, 18}, + {"ff", required_argument, NULL, 18}, // old --ff, --excl-flags = --ns, --skip-any-set + {"ls", required_argument, NULL, 19}, + {"skip-any-unset", required_argument, NULL, 16}, + {"skip-all-unset", required_argument, NULL, 17}, + {"skip-any-set", required_argument, NULL, 18}, + {"skip-all-set", required_argument, NULL, 19}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"ignore-RG", no_argument, NULL, 5}, @@ -1289,13 +1303,21 @@ int main_mpileup(int argc, char *argv[]) while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; - case 1 : - mplp.rflag_require = bam_str2flag(optarg); - if ( mplp.rflag_require<0 ) { fprintf(bcftools_stderr,"Could not parse --rf %s\n", optarg); return 1; } + case 16 : + mplp.rflag_skip_any_unset = bam_str2flag(optarg); + if ( mplp.rflag_skip_any_unset <0 ) { fprintf(bcftools_stderr,"Could not parse --nf %s\n", optarg); return 1; } + break; + case 17 : + mplp.rflag_skip_all_unset = bam_str2flag(optarg); + if ( mplp.rflag_skip_all_unset<0 ) { fprintf(bcftools_stderr,"Could not parse --if %s\n", optarg); return 1; } + break; + case 18 : + mplp.rflag_skip_any_set = bam_str2flag(optarg); + if ( mplp.rflag_skip_any_set <0 ) { fprintf(bcftools_stderr,"Could not parse --ef %s\n", optarg); return 1; } break; - case 2 : - mplp.rflag_filter = bam_str2flag(optarg); - if ( mplp.rflag_filter<0 ) { fprintf(bcftools_stderr,"Could not parse --ff %s\n", optarg); return 1; } + case 19 : + mplp.rflag_skip_all_set = bam_str2flag(optarg); + if ( mplp.rflag_skip_all_set <0 ) { fprintf(bcftools_stderr,"Could not parse --df %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 
4 : mplp.openQ = atoi(optarg); break; diff --git a/bcftools/regidx.h b/bcftools/regidx.h index f13b52a..c40bbd8 100644 --- a/bcftools/regidx.h +++ b/bcftools/regidx.h @@ -65,6 +65,28 @@ #include #include +/* Avoid conflicts with HTSlib's regidx_* functions. */ +#define regidx_destroy bcftools_regidx_destroy +#define regidx_init bcftools_regidx_init +#define regidx_init_string bcftools_regidx_init_string +#define regidx_insert bcftools_regidx_insert +#define regidx_insert_list bcftools_regidx_insert_list +#define regidx_nregs bcftools_regidx_nregs +#define regidx_overlap bcftools_regidx_overlap +#define regidx_parse_bed bcftools_regidx_parse_bed +#define regidx_parse_reg bcftools_regidx_parse_reg +#define regidx_parse_tab bcftools_regidx_parse_tab +#define regidx_parse_vcf bcftools_regidx_parse_vcf +#define regidx_push bcftools_regidx_push +#define regidx_seq_names bcftools_regidx_seq_names +#define regidx_seq_nregs bcftools_regidx_seq_nregs +#define regitr_copy bcftools_regitr_copy +#define regitr_destroy bcftools_regitr_destroy +#define regitr_init bcftools_regitr_init +#define regitr_loop bcftools_regitr_loop +#define regitr_overlap bcftools_regitr_overlap +#define regitr_reset bcftools_regitr_reset + #ifdef __cplusplus extern "C" { #endif diff --git a/bcftools/reheader.c b/bcftools/reheader.c index ae7c622..4458f27 100644 --- a/bcftools/reheader.c +++ b/bcftools/reheader.c @@ -1,6 +1,6 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -142,14 +142,16 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see } char *init_tmp_prefix(const char *tmp_prefix) { - char *prefix = NULL; + kstring_t prefix = {0,0,0}; if ( tmp_prefix ) { - int len = strlen(tmp_prefix); - prefix = (char*) calloc(len+7,1); - memcpy(prefix,tmp_prefix,len); - memcpy(prefix+len,"XXXXXX",6); + ksprintf(&prefix,"%sXXXXXX",tmp_prefix); + return prefix.s; } + + char *tmpdir = getenv("TMPDIR"); + if ( tmpdir ) + kputs(tmpdir, &prefix); else { #ifdef _WIN32 @@ -157,15 +159,13 @@ char *init_tmp_prefix(const char *tmp_prefix) int ret = GetTempPath(MAX_PATH, tmp_path); if (!ret || ret > MAX_PATH) error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools.XXXXXX"); - prefix = strdup(tmp_path); + kputs(tmp_path, &prefix); #else - prefix = strdup("/tmp/bcftools.XXXXXX"); + kputs("/tmp", &prefix); #endif } - return prefix; + kputs("/bcftools.XXXXXX", &prefix); + return prefix.s; } static void update_from_fai(args_t *args) { diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c index 380843b..a069870 100644 --- a/bcftools/reheader.c.pysam.c +++ b/bcftools/reheader.c.pysam.c @@ -2,7 +2,7 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -144,14 +144,16 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see } char *init_tmp_prefix(const char *tmp_prefix) { - char *prefix = NULL; + kstring_t prefix = {0,0,0}; if ( tmp_prefix ) { - int len = strlen(tmp_prefix); - prefix = (char*) calloc(len+7,1); - memcpy(prefix,tmp_prefix,len); - memcpy(prefix+len,"XXXXXX",6); + ksprintf(&prefix,"%sXXXXXX",tmp_prefix); + return prefix.s; } + + char *tmpdir = getenv("TMPDIR"); + if ( tmpdir ) + kputs(tmpdir, &prefix); else { #ifdef _WIN32 @@ -159,15 +161,13 @@ char *init_tmp_prefix(const char *tmp_prefix) int ret = GetTempPath(MAX_PATH, tmp_path); if (!ret || ret > MAX_PATH) error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools.XXXXXX"); - prefix = strdup(tmp_path); + kputs(tmp_path, &prefix); #else - prefix = strdup("/tmp/bcftools.XXXXXX"); + kputs("/tmp", &prefix); #endif } - return prefix; + kputs("/bcftools.XXXXXX", &prefix); + return prefix.s; } static void update_from_fai(args_t *args) { diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c index d170db5..e3fbacc 100644 --- a/bcftools/smpl_ilist.c +++ b/bcftools/smpl_ilist.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016, 2018 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -63,7 +63,10 @@ smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, in char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); if ( !list ) error("Could not parse %s\n", sample_list); - // preserve the VCF order + if ( negate && (flags&SMPL_REORDER) ) flags &= ~SMPL_REORDER; + + // preserve the VCF order unless flags&SMPL_REORDER is set + int j = 0; int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); char **pair = NULL; for (i=0; in++; } + if ( flags & SMPL_REORDER ) + { + smpl->idx = tmp; + for (i=0; in = bcf_hdr_nsamples(hdr) - smpl->n; smpl->idx = (int*) malloc(sizeof(int)*smpl->n); - int j = 0; + j = 0; if ( !negate ) { if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c index 85b5e2f..68ed527 100644 --- a/bcftools/smpl_ilist.c.pysam.c +++ b/bcftools/smpl_ilist.c.pysam.c @@ -1,7 +1,7 @@ #include "bcftools.pysam.h" /* - Copyright (C) 2016, 2018 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -65,7 +65,10 @@ smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, in char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); if ( !list ) error("Could not parse %s\n", sample_list); - // preserve the VCF order + if ( negate && (flags&SMPL_REORDER) ) flags &= ~SMPL_REORDER; + + // preserve the VCF order unless flags&SMPL_REORDER is set + int j = 0; int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); char **pair = NULL; for (i=0; in++; } + if ( flags & SMPL_REORDER ) + { + smpl->idx = tmp; + for (i=0; in = bcf_hdr_nsamples(hdr) - smpl->n; smpl->idx = (int*) malloc(sizeof(int)*smpl->n); - int j = 0; + j = 0; if ( !negate ) { if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); diff --git a/bcftools/smpl_ilist.h b/bcftools/smpl_ilist.h index 23a0e53..79292c3 100644 --- a/bcftools/smpl_ilist.h +++ b/bcftools/smpl_ilist.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. Author: Petr Danecek @@ -36,6 +36,7 @@ #define SMPL_PAIR1 4 // two samples expected, the first is from the bcf hdr #define SMPL_PAIR2 8 // two samples expected, the second is from the bcf hdr #define SMPL_VERBOSE 16 // print warnings +#define SMPL_REORDER 32 // reorder samples as asked, sample_list[i] points to the VCF header index typedef struct { diff --git a/bcftools/tsv2vcf.c b/bcftools/tsv2vcf.c index 2e1aa52..596e75a 100644 --- a/bcftools/tsv2vcf.c +++ b/bcftools/tsv2vcf.c @@ -1,6 +1,6 @@ /* tsv2vcf.c -- convert from whitespace-separated fields to VCF - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -119,4 +119,17 @@ int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr) return 0; } +int tsv_setter_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + bcf_hdr_t *hdr = (bcf_hdr_t*)usr; + char *sb = tsv->ss; + while ( *sb && !isspace(*sb) ) sb++; + if ( !*sb ) return -1; + char tmp = *sb; + *sb = ','; + bcf_update_alleles_str(hdr, rec, tsv->ss); + *sb = tmp; + return 0; +} + diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c index f6aabf5..8c62157 100644 --- a/bcftools/tsv2vcf.c.pysam.c +++ b/bcftools/tsv2vcf.c.pysam.c @@ -2,7 +2,7 @@ /* tsv2vcf.c -- convert from whitespace-separated fields to VCF - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -121,4 +121,17 @@ int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr) return 0; } +int tsv_setter_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + bcf_hdr_t *hdr = (bcf_hdr_t*)usr; + char *sb = tsv->ss; + while ( *sb && !isspace(*sb) ) sb++; + if ( !*sb ) return -1; + char tmp = *sb; + *sb = ','; + bcf_update_alleles_str(hdr, rec, tsv->ss); + *sb = tmp; + return 0; +} + diff --git a/bcftools/tsv2vcf.h b/bcftools/tsv2vcf.h index 6fe5b45..68757d4 100644 --- a/bcftools/tsv2vcf.h +++ b/bcftools/tsv2vcf.h @@ -1,6 +1,6 @@ /* tsv2vcf.h -- convert from whitespace-separated fields to VCF - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -80,6 +80,7 @@ static inline int tsv_next(tsv_t *tsv) int tsv_setter_chrom(tsv_t *tsv, bcf1_t *rec, void *usr); int tsv_setter_pos(tsv_t *tsv, bcf1_t *rec, void *usr); int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr); +int tsv_setter_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr); // usr must point to bcf_hdr_t #endif diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index 14ee5de..b5e45d4 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. 
- Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -45,6 +45,7 @@ THE SOFTWARE. */ #include "convert.h" #include "smpl_ilist.h" #include "regidx.h" +#include "dbuf.h" struct _args_t; @@ -159,9 +160,14 @@ typedef struct _args_t char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites; + char **rename_annots_map; + char *min_overlap_str; + float min_overlap_ann, min_overlap_vcf; + int rename_annots_nmap; kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; - int columns_is_file, has_append_mode; + int columns_is_file, has_append_mode, pair_logic; + dbuf_t *header_lines; } args_t; @@ -396,7 +402,7 @@ static void init_remove_annots(args_t *args) if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT); } else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; - else if ( !strcasecmp("INFO",str.s) ) + else if ( !strcasecmp("INFO",str.s) ) { if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt); tag->handler = remove_info; @@ -449,7 +455,7 @@ static void init_remove_annots(args_t *args) rm_tag_t *tag = &args->rm[args->nrm-1]; if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag; else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag; - else + else { tag->handler = remove_filter; tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]); @@ -465,16 +471,31 @@ static void init_remove_annots(args_t *args) } static void init_header_lines(args_t *args) { - htsFile *file = hts_open(args->header_fname, "rb"); - if ( !file ) error("Error reading %s\n", args->header_fname); - kstring_t str = {0,0,0}; - while ( hts_getline(file, KS_SEP_LINE, &str) > 0 ) + if ( args->header_fname ) { - if ( bcf_hdr_append(args->hdr_out,str.s) 
) error("Could not parse %s: %s\n", args->header_fname, str.s); - bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) + htsFile *file = hts_open(args->header_fname, "rb"); + if ( !file ) error("Error reading %s\n", args->header_fname); + kstring_t str = {0,0,0}; + while ( hts_getline(file, KS_SEP_LINE, &str) > 0 ) + { + if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); + bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) + } + if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->header_fname); + free(str.s); + } + if ( args->header_lines ) + { + int i, n = dbuf_n(args->header_lines); + for (i=0; iheader_lines,i); + if ( bcf_hdr_append(args->hdr_out,line) ) error("Could not parse the header line: %s\n", line); + bcf_hdr_append(args->hdr,line); // the input file may not have the header line if run with -H (and nothing else) + } + dbuf_destroy_free(args->header_lines); + args->header_lines = NULL; } - if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->header_fname); - free(str.s); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update output header", __func__); if (bcf_hdr_sync(args->hdr) < 0) @@ -482,7 +503,7 @@ static void init_header_lines(args_t *args) } static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) { - return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); + return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); } static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) { @@ -534,9 +555,9 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( !(col->replace & REPLACE_MISSING) ) { bcf_update_filter(args->hdr_out,line,NULL,0); - return bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); } - + // only update missing FILTER if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); if ( !line->d.n_flt ) @@ -651,6 +672,7 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da { bcf1_t *rec = (bcf1_t*) data; int i; + if ( line->n_allele>1 && (col->replace & REPLACE_MISSING) ) return 0; if ( rec->n_allele==line->n_allele ) { for (i=1; in_allele; i++) if ( strcmp(rec->d.allele[i],line->d.allele[i]) ) break; @@ -760,7 +782,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d { if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && - col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND && col->merge_method!=MM_APPEND_MISSING ) error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n"); } @@ -799,7 +821,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } else { - args->tmpi[ntmpi-1] = 
strtol(str, &end, 10); + args->tmpi[ntmpi-1] = strtol(str, &end, 10); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); str = end+1; @@ -854,7 +876,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d col->mm_dbl_nused = col->mm_dbl_ndat = 0; } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi); if ( col->replace & REPLACE_MISSING ) @@ -870,7 +892,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); if ( ntmpi < 0 ) return 0; // nothing to add - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi); if ( col->replace & REPLACE_MISSING ) @@ -957,7 +979,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * hts_expand(float,ntmpf,args->mtmpf,args->tmpf); if ( str[0]=='.' 
&& (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) bcf_float_set_missing(args->tmpf[ntmpf-1]); else ntmpf--; @@ -1037,7 +1059,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * col->mm_dbl_nused = col->mm_dbl_ndat = 0; } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf); if ( col->replace & REPLACE_MISSING ) @@ -1054,7 +1076,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); if ( ntmpf < 0 ) return 0; // nothing to add - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf); if ( col->replace & REPLACE_MISSING ) @@ -1069,7 +1091,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) { assert( col->merge_method==MM_FIRST ); - + int nsrc = 1, lsrc = 0; while ( args->tmps[lsrc] ) { @@ -1087,7 +1109,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in // fill in any missing values in the target VCF (or all, if not present) int i, empty = 0, nstr, mstr = args->tmpks.m; - nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); + nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); args->tmpks.m = mstr; if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' 
&& args->tmpks.s[1]==0) ) { @@ -1200,7 +1222,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d hts_expand(char,len+1,args->mtmps,args->tmps); memcpy(args->tmps,tab->cols[col->icol],len+1); - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } @@ -1218,7 +1240,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ntmps < 0 ) return 0; // nothing to add } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); if ( col->replace & REPLACE_MISSING ) @@ -1251,8 +1273,8 @@ gt_length_too_big: for (i=0; il - plen > blen ) @@ -1314,7 +1336,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo } return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nsrc ) { for (i=0; ihdr_out); i++) { @@ -1359,7 +1381,7 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) for (i=icol_beg; icols[i], *end = str; - if ( str[0]=='.' && !str[1] ) + if ( str[0]=='.' && !str[1] ) { // missing value if ( !nmax ) nmax = 1; @@ -1402,7 +1424,7 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, } return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nvals ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { @@ -1417,7 +1439,7 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING // x . . -TAG .. 
REPLACE_NON_MISSING - if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } for (j=0; jhdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nvals ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } for (j=0; jtmps2; for (i=0; itmpp2[i] = tmp; tmp += 2; @@ -1544,7 +1566,7 @@ static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **src = vals + args->sample_map[i]; char **dst = args->tmpp2 + i; - if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' 
&& (*src)[1]==0 ) continue; } *dst = *src; @@ -1556,7 +1578,7 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); @@ -1578,7 +1600,7 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void } char *end = str; - ptr[ival] = strtol(str, &end, 10); + ptr[ival] = strtol(str, &end, 10); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); @@ -1595,7 +1617,7 @@ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); @@ -1611,14 +1633,14 @@ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void { if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value { - bcf_float_set_missing(ptr[ival]); + bcf_float_set_missing(ptr[ival]); ival++; str += str[1] ? 
2 : 1; continue; } char *end = str; - ptr[ival] = strtod(str, &end); + ptr[ival] = strtod(str, &end); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); @@ -1635,7 +1657,7 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int ismpl; @@ -1659,12 +1681,12 @@ static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int if ( has_value ) { if ( j==ndip ) - { + { smpl[i] = 2; - max_ploidy = 2; + max_ploidy = 2; } else if ( j==nals ) - { + { smpl[i] = 1; if ( !max_ploidy ) max_ploidy = 1; } @@ -1875,7 +1897,7 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, if ( k>=0 ) { if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]); - else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); + else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); else ptr_dst[k] = ptr_src[j]; } } @@ -1928,7 +1950,7 @@ static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, v if ( ndst1 < n ) ndst1 = n; } assert( ndst1 ); - + int ndst = ndst1*nsmpl_dst; hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated @@ -2115,6 +2137,7 @@ static char *set_replace_mode(char *ss, int *replace) *replace = mode; return ss; } +static void rename_annots_push(args_t *args, char *src, char *dst); static void init_columns(args_t *args) { int need_sample_map = 
0; @@ -2164,6 +2187,7 @@ static void init_columns(args_t *args) int icol = -1, has_fmt_str = 0; while ( *ss ) { + char *ptr; if ( *se && *se!=',' ) { se++; continue; } int replace; ss = set_replace_mode(ss, &replace); @@ -2198,6 +2222,8 @@ static void init_columns(args_t *args) col->setter = vcf_setter_alt; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); + col->replace = replace; + if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF); } else args->alt_idx = icol; } @@ -2257,11 +2283,17 @@ static void init_columns(args_t *args) col->hdr_key_dst = strdup(str.s); col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5); int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src); - if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) + if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname); if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR ) error("Only Type=String tags can be used to annotate the ID column\n"); } + else if ( (ptr=strstr(str.s,":=")) && !args->targets_fname ) + { + *ptr = 0; + rename_annots_push(args,ptr+2,str.s); + *ptr = ':'; + } else if ( !strcasecmp("FILTER",str.s) ) { if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); @@ -2536,7 +2568,7 @@ static void init_columns(args_t *args) hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } if ( args->tgts_is_vcf ) @@ -2686,41 +2718,58 @@ static void rename_chrs(args_t *args, char *fname) for (i=0; ihdr_out, BCF_DT_ID, 
ori_tag); + if ( id<0 ) return 1; + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL); + if ( !hrec ) return 1; // the ID attribute not present + int j = bcf_hrec_find_key(hrec, "ID"); + assert( j>=0 ); + free(hrec->vals[j]); + char *ptr = new_tag; + while ( *ptr && !isspace(*ptr) ) ptr++; + *ptr = 0; + hrec->vals[j] = strdup(new_tag); + args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + return 0; +} +static void rename_annots(args_t *args) { - int n, i; - char **map = hts_readlist(fname, 1, &n); - if ( !map ) error("Could not read: %s\n", fname); - for (i=0; irename_annots ) { - char *sb = NULL, *ss = map[i]; - while ( *ss && !isspace(*ss) ) ss++; - if ( !*ss ) error("Could not parse: %s\n", fname); - *ss = 0; - int type; - if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5; - else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7; - else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4; - else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7; - else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname); - int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb); - if ( id<0 ) continue; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL); - if ( !hrec ) continue; // the sequence not present - int j = bcf_hrec_find_key(hrec, "ID"); - assert( j>=0 ); - free(hrec->vals[j]); - ss++; - while ( *ss && isspace(*ss) ) ss++; - char *se = ss; - while ( *se && !isspace(*se) ) se++; - *se = 0; - hrec->vals[j] = strdup(ss); - args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + args->rename_annots_map = hts_readlist(args->rename_annots, 1, &args->rename_annots_nmap); + if ( !args->rename_annots_map ) error("Could not read: %s\n", args->rename_annots); } - for (i=0; irename_annots_nmap; i++) + { + char *ptr = args->rename_annots_map[i]; + while ( *ptr && !isspace(*ptr) ) 
ptr++; + if ( !*ptr ) error("Could not parse: %s\n", args->rename_annots_map[i]); + char *rmme = ptr; + *ptr = 0; + ptr++; + while ( *ptr && isspace(*ptr) ) ptr++; + if ( !*ptr ) { *rmme = ' '; error("Could not parse: %s\n", args->rename_annots_map[i]); } + if ( rename_annots_core(args, args->rename_annots_map[i], ptr) < 0 ) + error("Could not parse \"%s %s\", expected INFO, FORMAT, or FILTER prefix\n",args->rename_annots_map[i],ptr); + } +} +static void rename_annots_push(args_t *args, char *src, char *dst) +{ + args->rename_annots_nmap++; + args->rename_annots_map = (char**)realloc(args->rename_annots_map,sizeof(*args->rename_annots_map)*args->rename_annots_nmap); + kstring_t str = {0,0,0}; + ksprintf(&str,"%s %s",src,dst); + args->rename_annots_map[ args->rename_annots_nmap - 1 ] = str.s; } static void init_data(args_t *args) @@ -2734,7 +2783,7 @@ static void init_data(args_t *args) args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); } if ( args->remove_annots ) init_remove_annots(args); - if ( args->header_fname ) init_header_lines(args); + if ( args->header_fname || args->header_lines ) init_header_lines(args); if ( args->targets_fname && args->tgts_is_vcf ) { // reading annots from a VCF @@ -2769,6 +2818,22 @@ static void init_data(args_t *args) args->nalines++; hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); } + if ( args->min_overlap_str ) + { + char *tmp = args->min_overlap_str; + if ( args->min_overlap_str[0] != ':' ) + { + args->min_overlap_ann = strtod(args->min_overlap_str,&tmp); + if ( args->min_overlap_ann < 0 || args->min_overlap_ann > 1 || (*tmp && *tmp!=':') ) + error("Could not parse \"--min-overlap %s\", expected value(s) between 0-1\n", args->min_overlap_str); + } + if ( *tmp && *tmp==':' ) + { + args->min_overlap_vcf = strtod(tmp+1,&tmp); + if ( args->min_overlap_vcf < 0 || args->min_overlap_vcf > 1 || *tmp ) + error("Could not parse \"--min-overlap %s\", expected value(s) between 0-1\n", 
args->min_overlap_str); + } + } } init_merge_method(args); args->vcmp = vcmp_init(); @@ -2787,7 +2852,7 @@ static void init_data(args_t *args) if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); - if ( args->rename_annots ) rename_annots(args, args->rename_annots); + if ( args->rename_annots || args->rename_annots_map ) rename_annots(args); char wmode[8]; set_wmode(wmode,args->output_type,args->output_fname,args->clevel); @@ -2835,6 +2900,11 @@ static void destroy_data(args_t *args) regidx_destroy(args->tgt_idx); regitr_destroy(args->tgt_itr); } + if ( args->rename_annots_map ) + { + for (i=0; irename_annots_nmap; i++) free(args->rename_annots_map[i]); + free(args->rename_annots_map); + } if ( args->tgts ) bcf_sr_regions_destroy(args->tgts); free(args->tmpks.s); free(args->tmpi); @@ -2879,7 +2949,7 @@ static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) } if ( args->ref_idx != -1 ) { - if ( args->ref_idx >= tmp->ncols ) + if ( args->ref_idx >= tmp->ncols ) error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); if ( args->alt_idx >= tmp->ncols ) error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); @@ -2988,6 +3058,15 @@ static void annotate(args_t *args, bcf1_t *line) tmp->rid = line->rid; tmp->start = args->tgt_itr->beg; tmp->end = args->tgt_itr->end; + + // Check min overlap + int len_ann = tmp->end - tmp->start + 1; + int len_vcf = line->rlen; + int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? 
tmp->start : line->pos) + 1; + assert( isec > 0 ); + if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; + if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; + parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); for (j=0; jncols; j++) { @@ -3186,24 +3265,26 @@ static void usage(args_t *args) { fprintf(stderr, "\n"); fprintf(stderr, "About: Annotate and edit VCF/BCF files.\n"); - fprintf(stderr, "Usage: bcftools annotate [options] \n"); + fprintf(stderr, "Usage: bcftools annotate [options] VCF\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(stderr, " --collapse STR Matching records by , see man page for details [some]\n"); fprintf(stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); fprintf(stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " --force Continue despite parsing error (at your own risk!)\n"); + fprintf(stderr, " -H, --header-line STR Header line which should be appended to the VCF header, can be given multiple times\n"); fprintf(stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n"); fprintf(stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n"); fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n"); fprintf(stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for 
details), EXPERIMENTAL\n"); fprintf(stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(stderr, " --min-overlap ANN:VCF Required overlap as a fraction of variant in the -a file (ANN), the VCF (:VCF), or reciprocal (ANN:VCF)\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); @@ -3235,7 +3316,8 @@ int main_vcfannotate(int argc, char *argv[]) args->set_ids_replace = 1; args->match_id = -1; args->clevel = -1; - int regions_is_file = 0, collapse = 0; + args->pair_logic = -1; + int regions_is_file = 0; int regions_overlap = 1; static struct option loptions[] = @@ -3249,6 +3331,7 @@ int main_vcfannotate(int argc, char *argv[]) {"annotations",required_argument,NULL,'a'}, {"merge-logic",required_argument,NULL,'l'}, {"collapse",required_argument,NULL,2}, + {"pair-logic",required_argument,NULL,2}, {"include",required_argument,NULL,'i'}, {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, @@ -3260,26 +3343,28 @@ int main_vcfannotate(int argc, char *argv[]) {"rename-annots",required_argument,NULL,11}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, + {"header-line",required_argument,NULL,'H'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, 
{"single-overlaps",no_argument,NULL,10}, + {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { case 'f': args->force = 1; break; case 'k': args->keep_sites = 1; break; - case 'm': + case 'm': args->mark_sites_logic = MARK_LISTED; if ( optarg[0]=='+' ) args->mark_sites = optarg+1; else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } - else args->mark_sites = optarg; + else args->mark_sites = optarg; break; - case 'l': + case 'l': if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str); kputs(optarg,&args->merge_method_str); break; @@ -3318,27 +3403,28 @@ int main_vcfannotate(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'h': args->header_fname = optarg; break; + case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break; case 1 : args->rename_chrs = optarg; break; case 2 : - if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; - else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; - else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; - else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; - else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; - else error("The --collapse string \"%s\" not recognised.\n", optarg); + if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF; + else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF; + else if ( !strcmp(optarg,"both") ) 
args->pair_logic |= BCF_SR_PAIR_BOTH_REF; + else if ( !strcmp(optarg,"any") ) args->pair_logic |= BCF_SR_PAIR_ANY; + else if ( !strcmp(optarg,"all") ) args->pair_logic |= BCF_SR_PAIR_ANY; + else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME; + else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else error("The --pair-logic string \"%s\" not recognised.\n", optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; + case 12 : args->min_overlap_str = optarg; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -3360,7 +3446,7 @@ int main_vcfannotate(int argc, char *argv[]) } if ( args->targets_fname ) { - htsFile *fp = hts_open(args->targets_fname,"r"); + htsFile *fp = hts_open(args->targets_fname,"r"); if ( !fp ) error("Failed to open %s\n", args->targets_fname); htsFormat type = *hts_get_format(fp); hts_close(fp); @@ -3369,9 +3455,11 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - args->files->collapse = collapse ? collapse : COLLAPSE_SOME; + bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? 
args->pair_logic : BCF_SR_PAIR_SOME); + if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n"); } } + if ( args->min_overlap_str && args->single_overlaps ) error("The options --single-overlaps and --min-overlap cannot be combined\n"); if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); @@ -3399,7 +3487,7 @@ int main_vcfannotate(int argc, char *argv[]) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); continue; diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index 3c8469e..dfc8df0 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2,7 +2,7 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -47,6 +47,7 @@ THE SOFTWARE. 
*/ #include "convert.h" #include "smpl_ilist.h" #include "regidx.h" +#include "dbuf.h" struct _args_t; @@ -161,9 +162,14 @@ typedef struct _args_t char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites; + char **rename_annots_map; + char *min_overlap_str; + float min_overlap_ann, min_overlap_vcf; + int rename_annots_nmap; kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; - int columns_is_file, has_append_mode; + int columns_is_file, has_append_mode, pair_logic; + dbuf_t *header_lines; } args_t; @@ -398,7 +404,7 @@ static void init_remove_annots(args_t *args) if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT); } else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; - else if ( !strcasecmp("INFO",str.s) ) + else if ( !strcasecmp("INFO",str.s) ) { if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt); tag->handler = remove_info; @@ -451,7 +457,7 @@ static void init_remove_annots(args_t *args) rm_tag_t *tag = &args->rm[args->nrm-1]; if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag; else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag; - else + else { tag->handler = remove_filter; tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]); @@ -467,16 +473,31 @@ static void init_remove_annots(args_t *args) } static void init_header_lines(args_t *args) { - htsFile *file = hts_open(args->header_fname, "rb"); - if ( !file ) error("Error reading %s\n", args->header_fname); - kstring_t str = {0,0,0}; - while ( hts_getline(file, KS_SEP_LINE, &str) > 0 ) + if ( args->header_fname ) { - if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); - bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line 
if run with -h (and nothing else) + htsFile *file = hts_open(args->header_fname, "rb"); + if ( !file ) error("Error reading %s\n", args->header_fname); + kstring_t str = {0,0,0}; + while ( hts_getline(file, KS_SEP_LINE, &str) > 0 ) + { + if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); + bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) + } + if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->header_fname); + free(str.s); + } + if ( args->header_lines ) + { + int i, n = dbuf_n(args->header_lines); + for (i=0; iheader_lines,i); + if ( bcf_hdr_append(args->hdr_out,line) ) error("Could not parse the header line: %s\n", line); + bcf_hdr_append(args->hdr,line); // the input file may not have the header line if run with -H (and nothing else) + } + dbuf_destroy_free(args->header_lines); + args->header_lines = NULL; } - if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->header_fname); - free(str.s); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update output header", __func__); if (bcf_hdr_sync(args->hdr) < 0) @@ -484,7 +505,7 @@ static void init_header_lines(args_t *args) } static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) { - return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); + return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); } static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) { @@ -536,9 +557,9 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( !(col->replace & REPLACE_MISSING) ) { bcf_update_filter(args->hdr_out,line,NULL,0); - return bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); } - + // only update missing FILTER if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); if ( !line->d.n_flt ) @@ -653,6 +674,7 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da { bcf1_t *rec = (bcf1_t*) data; int i; + if ( line->n_allele>1 && (col->replace & REPLACE_MISSING) ) return 0; if ( rec->n_allele==line->n_allele ) { for (i=1; in_allele; i++) if ( strcmp(rec->d.allele[i],line->d.allele[i]) ) break; @@ -762,7 +784,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d { if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && - col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND && col->merge_method!=MM_APPEND_MISSING ) error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n"); } @@ -801,7 +823,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } else { - args->tmpi[ntmpi-1] = 
strtol(str, &end, 10); + args->tmpi[ntmpi-1] = strtol(str, &end, 10); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); str = end+1; @@ -856,7 +878,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d col->mm_dbl_nused = col->mm_dbl_ndat = 0; } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi); if ( col->replace & REPLACE_MISSING ) @@ -872,7 +894,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); if ( ntmpi < 0 ) return 0; // nothing to add - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi); if ( col->replace & REPLACE_MISSING ) @@ -959,7 +981,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * hts_expand(float,ntmpf,args->mtmpf,args->tmpf); if ( str[0]=='.' 
&& (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) bcf_float_set_missing(args->tmpf[ntmpf-1]); else ntmpf--; @@ -1039,7 +1061,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * col->mm_dbl_nused = col->mm_dbl_ndat = 0; } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf); if ( col->replace & REPLACE_MISSING ) @@ -1056,7 +1078,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); if ( ntmpf < 0 ) return 0; // nothing to add - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf); if ( col->replace & REPLACE_MISSING ) @@ -1071,7 +1093,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) { assert( col->merge_method==MM_FIRST ); - + int nsrc = 1, lsrc = 0; while ( args->tmps[lsrc] ) { @@ -1089,7 +1111,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in // fill in any missing values in the target VCF (or all, if not present) int i, empty = 0, nstr, mstr = args->tmpks.m; - nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); + nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); args->tmpks.m = mstr; if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' 
&& args->tmpks.s[1]==0) ) { @@ -1202,7 +1224,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d hts_expand(char,len+1,args->mtmps,args->tmps); memcpy(args->tmps,tab->cols[col->icol],len+1); - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } @@ -1220,7 +1242,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ntmps < 0 ) return 0; // nothing to add } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); if ( col->replace & REPLACE_MISSING ) @@ -1253,8 +1275,8 @@ gt_length_too_big: for (i=0; il - plen > blen ) @@ -1316,7 +1338,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo } return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nsrc ) { for (i=0; ihdr_out); i++) { @@ -1361,7 +1383,7 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) for (i=icol_beg; icols[i], *end = str; - if ( str[0]=='.' && !str[1] ) + if ( str[0]=='.' && !str[1] ) { // missing value if ( !nmax ) nmax = 1; @@ -1404,7 +1426,7 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, } return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nvals ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { @@ -1419,7 +1441,7 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING // x . . -TAG .. 
REPLACE_NON_MISSING - if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } for (j=0; jhdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nvals ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } for (j=0; jtmps2; for (i=0; itmpp2[i] = tmp; tmp += 2; @@ -1546,7 +1568,7 @@ static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **src = vals + args->sample_map[i]; char **dst = args->tmpp2 + i; - if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' 
&& (*src)[1]==0 ) continue; } *dst = *src; @@ -1558,7 +1580,7 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); @@ -1580,7 +1602,7 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void } char *end = str; - ptr[ival] = strtol(str, &end, 10); + ptr[ival] = strtol(str, &end, 10); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); @@ -1597,7 +1619,7 @@ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); @@ -1613,14 +1635,14 @@ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void { if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value { - bcf_float_set_missing(ptr[ival]); + bcf_float_set_missing(ptr[ival]); ival++; str += str[1] ? 
2 : 1; continue; } char *end = str; - ptr[ival] = strtod(str, &end); + ptr[ival] = strtod(str, &end); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); @@ -1637,7 +1659,7 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int ismpl; @@ -1661,12 +1683,12 @@ static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int if ( has_value ) { if ( j==ndip ) - { + { smpl[i] = 2; - max_ploidy = 2; + max_ploidy = 2; } else if ( j==nals ) - { + { smpl[i] = 1; if ( !max_ploidy ) max_ploidy = 1; } @@ -1877,7 +1899,7 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, if ( k>=0 ) { if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]); - else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); + else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); else ptr_dst[k] = ptr_src[j]; } } @@ -1930,7 +1952,7 @@ static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, v if ( ndst1 < n ) ndst1 = n; } assert( ndst1 ); - + int ndst = ndst1*nsmpl_dst; hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated @@ -2117,6 +2139,7 @@ static char *set_replace_mode(char *ss, int *replace) *replace = mode; return ss; } +static void rename_annots_push(args_t *args, char *src, char *dst); static void init_columns(args_t *args) { int need_sample_map = 
0; @@ -2166,6 +2189,7 @@ static void init_columns(args_t *args) int icol = -1, has_fmt_str = 0; while ( *ss ) { + char *ptr; if ( *se && *se!=',' ) { se++; continue; } int replace; ss = set_replace_mode(ss, &replace); @@ -2200,6 +2224,8 @@ static void init_columns(args_t *args) col->setter = vcf_setter_alt; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); + col->replace = replace; + if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF); } else args->alt_idx = icol; } @@ -2259,11 +2285,17 @@ static void init_columns(args_t *args) col->hdr_key_dst = strdup(str.s); col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5); int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src); - if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) + if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname); if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR ) error("Only Type=String tags can be used to annotate the ID column\n"); } + else if ( (ptr=strstr(str.s,":=")) && !args->targets_fname ) + { + *ptr = 0; + rename_annots_push(args,ptr+2,str.s); + *ptr = ':'; + } else if ( !strcasecmp("FILTER",str.s) ) { if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); @@ -2538,7 +2570,7 @@ static void init_columns(args_t *args) hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } if ( args->tgts_is_vcf ) @@ -2688,41 +2720,58 @@ static void rename_chrs(args_t *args, char *fname) for (i=0; ihdr_out, BCF_DT_ID, 
ori_tag); + if ( id<0 ) return 1; + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL); + if ( !hrec ) return 1; // the ID attribute not present + int j = bcf_hrec_find_key(hrec, "ID"); + assert( j>=0 ); + free(hrec->vals[j]); + char *ptr = new_tag; + while ( *ptr && !isspace(*ptr) ) ptr++; + *ptr = 0; + hrec->vals[j] = strdup(new_tag); + args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + return 0; +} +static void rename_annots(args_t *args) { - int n, i; - char **map = hts_readlist(fname, 1, &n); - if ( !map ) error("Could not read: %s\n", fname); - for (i=0; irename_annots ) { - char *sb = NULL, *ss = map[i]; - while ( *ss && !isspace(*ss) ) ss++; - if ( !*ss ) error("Could not parse: %s\n", fname); - *ss = 0; - int type; - if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5; - else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7; - else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4; - else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7; - else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname); - int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb); - if ( id<0 ) continue; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL); - if ( !hrec ) continue; // the sequence not present - int j = bcf_hrec_find_key(hrec, "ID"); - assert( j>=0 ); - free(hrec->vals[j]); - ss++; - while ( *ss && isspace(*ss) ) ss++; - char *se = ss; - while ( *se && !isspace(*se) ) se++; - *se = 0; - hrec->vals[j] = strdup(ss); - args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + args->rename_annots_map = hts_readlist(args->rename_annots, 1, &args->rename_annots_nmap); + if ( !args->rename_annots_map ) error("Could not read: %s\n", args->rename_annots); } - for (i=0; irename_annots_nmap; i++) + { + char *ptr = args->rename_annots_map[i]; + while ( *ptr && !isspace(*ptr) ) 
ptr++; + if ( !*ptr ) error("Could not parse: %s\n", args->rename_annots_map[i]); + char *rmme = ptr; + *ptr = 0; + ptr++; + while ( *ptr && isspace(*ptr) ) ptr++; + if ( !*ptr ) { *rmme = ' '; error("Could not parse: %s\n", args->rename_annots_map[i]); } + if ( rename_annots_core(args, args->rename_annots_map[i], ptr) < 0 ) + error("Could not parse \"%s %s\", expected INFO, FORMAT, or FILTER prefix\n",args->rename_annots_map[i],ptr); + } +} +static void rename_annots_push(args_t *args, char *src, char *dst) +{ + args->rename_annots_nmap++; + args->rename_annots_map = (char**)realloc(args->rename_annots_map,sizeof(*args->rename_annots_map)*args->rename_annots_nmap); + kstring_t str = {0,0,0}; + ksprintf(&str,"%s %s",src,dst); + args->rename_annots_map[ args->rename_annots_nmap - 1 ] = str.s; } static void init_data(args_t *args) @@ -2736,7 +2785,7 @@ static void init_data(args_t *args) args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); } if ( args->remove_annots ) init_remove_annots(args); - if ( args->header_fname ) init_header_lines(args); + if ( args->header_fname || args->header_lines ) init_header_lines(args); if ( args->targets_fname && args->tgts_is_vcf ) { // reading annots from a VCF @@ -2771,6 +2820,22 @@ static void init_data(args_t *args) args->nalines++; hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); } + if ( args->min_overlap_str ) + { + char *tmp = args->min_overlap_str; + if ( args->min_overlap_str[0] != ':' ) + { + args->min_overlap_ann = strtod(args->min_overlap_str,&tmp); + if ( args->min_overlap_ann < 0 || args->min_overlap_ann > 1 || (*tmp && *tmp!=':') ) + error("Could not parse \"--min-overlap %s\", expected value(s) between 0-1\n", args->min_overlap_str); + } + if ( *tmp && *tmp==':' ) + { + args->min_overlap_vcf = strtod(tmp+1,&tmp); + if ( args->min_overlap_vcf < 0 || args->min_overlap_vcf > 1 || *tmp ) + error("Could not parse \"--min-overlap %s\", expected value(s) between 0-1\n", 
args->min_overlap_str); + } + } } init_merge_method(args); args->vcmp = vcmp_init(); @@ -2789,7 +2854,7 @@ static void init_data(args_t *args) if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); - if ( args->rename_annots ) rename_annots(args, args->rename_annots); + if ( args->rename_annots || args->rename_annots_map ) rename_annots(args); char wmode[8]; set_wmode(wmode,args->output_type,args->output_fname,args->clevel); @@ -2837,6 +2902,11 @@ static void destroy_data(args_t *args) regidx_destroy(args->tgt_idx); regitr_destroy(args->tgt_itr); } + if ( args->rename_annots_map ) + { + for (i=0; irename_annots_nmap; i++) free(args->rename_annots_map[i]); + free(args->rename_annots_map); + } if ( args->tgts ) bcf_sr_regions_destroy(args->tgts); free(args->tmpks.s); free(args->tmpi); @@ -2881,7 +2951,7 @@ static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) } if ( args->ref_idx != -1 ) { - if ( args->ref_idx >= tmp->ncols ) + if ( args->ref_idx >= tmp->ncols ) error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); if ( args->alt_idx >= tmp->ncols ) error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); @@ -2990,6 +3060,15 @@ static void annotate(args_t *args, bcf1_t *line) tmp->rid = line->rid; tmp->start = args->tgt_itr->beg; tmp->end = args->tgt_itr->end; + + // Check min overlap + int len_ann = tmp->end - tmp->start + 1; + int len_vcf = line->rlen; + int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? 
tmp->start : line->pos) + 1; + assert( isec > 0 ); + if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; + if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; + parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); for (j=0; jncols; j++) { @@ -3188,24 +3267,26 @@ static void usage(args_t *args) { fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "About: Annotate and edit VCF/BCF files.\n"); - fprintf(bcftools_stderr, "Usage: bcftools annotate [options] \n"); + fprintf(bcftools_stderr, "Usage: bcftools annotate [options] VCF\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(bcftools_stderr, " --collapse STR Matching records by , see man page for details [some]\n"); fprintf(bcftools_stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); fprintf(bcftools_stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); fprintf(bcftools_stderr, " --force Continue despite parsing error (at your own risk!)\n"); + fprintf(bcftools_stderr, " -H, --header-line STR Header line which should be appended to the VCF header, can be given multiple times\n"); fprintf(bcftools_stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n"); fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n"); fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); fprintf(bcftools_stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n"); fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(bcftools_stderr, " --min-overlap ANN:VCF Required overlap as a fraction of variant in the -a file (ANN), the VCF (:VCF), or reciprocal (ANN:VCF)\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); 
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); @@ -3237,7 +3318,8 @@ int main_vcfannotate(int argc, char *argv[]) args->set_ids_replace = 1; args->match_id = -1; args->clevel = -1; - int regions_is_file = 0, collapse = 0; + args->pair_logic = -1; + int regions_is_file = 0; int regions_overlap = 1; static struct option loptions[] = @@ -3251,6 +3333,7 @@ int main_vcfannotate(int argc, char *argv[]) {"annotations",required_argument,NULL,'a'}, {"merge-logic",required_argument,NULL,'l'}, {"collapse",required_argument,NULL,2}, + {"pair-logic",required_argument,NULL,2}, {"include",required_argument,NULL,'i'}, {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, @@ -3262,26 +3345,28 @@ int main_vcfannotate(int argc, char *argv[]) {"rename-annots",required_argument,NULL,11}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, + {"header-line",required_argument,NULL,'H'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, {"single-overlaps",no_argument,NULL,10}, + {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { case 'f': args->force = 1; break; case 'k': args->keep_sites = 1; break; - case 'm': + case 'm': args->mark_sites_logic = MARK_LISTED; if ( optarg[0]=='+' ) args->mark_sites = optarg+1; else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } - else args->mark_sites = optarg; + else args->mark_sites = optarg; break; - case 'l': + case 'l': if ( 
args->merge_method_str.l ) kputc(',',&args->merge_method_str); kputs(optarg,&args->merge_method_str); break; @@ -3320,27 +3405,28 @@ int main_vcfannotate(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'h': args->header_fname = optarg; break; + case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break; case 1 : args->rename_chrs = optarg; break; case 2 : - if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; - else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; - else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; - else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; - else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; - else error("The --collapse string \"%s\" not recognised.\n", optarg); + if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF; + else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF; + else if ( !strcmp(optarg,"both") ) args->pair_logic |= BCF_SR_PAIR_BOTH_REF; + else if ( !strcmp(optarg,"any") ) args->pair_logic |= BCF_SR_PAIR_ANY; + else if ( !strcmp(optarg,"all") ) args->pair_logic |= BCF_SR_PAIR_ANY; + else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME; + else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else error("The --pair-logic string \"%s\" not recognised.\n", optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( 
regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; + case 12 : args->min_overlap_str = optarg; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -3362,7 +3448,7 @@ int main_vcfannotate(int argc, char *argv[]) } if ( args->targets_fname ) { - htsFile *fp = hts_open(args->targets_fname,"r"); + htsFile *fp = hts_open(args->targets_fname,"r"); if ( !fp ) error("Failed to open %s\n", args->targets_fname); htsFormat type = *hts_get_format(fp); hts_close(fp); @@ -3371,9 +3457,11 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - args->files->collapse = collapse ? collapse : COLLAPSE_SOME; + bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? args->pair_logic : BCF_SR_PAIR_SOME); + if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n"); } } + if ( args->min_overlap_str && args->single_overlaps ) error("The options --single-overlaps and --min-overlap cannot be combined\n"); if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); @@ -3401,7 +3489,7 @@ int main_vcfannotate(int argc, char *argv[]) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 
0 : 1; - if ( !pass ) + if ( !pass ) { if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); continue; diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index ca2a899..0418d8e 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -1071,10 +1071,8 @@ int main_vcfcall(int argc, char *argv[]) case 9 : args.n_threads = strtol(optarg, 0, 0); break; case 8 : args.record_cmd_line = 0; break; case 4 : - if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args.regions_overlap = parse_overlap_option(optarg); + if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; default: usage(&args); } diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 63e1b03..1c1710b 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -1073,10 +1073,8 @@ int main_vcfcall(int argc, char *argv[]) case 9 : args.n_threads = strtol(optarg, 0, 0); break; case 8 : args.record_cmd_line = 0; break; case 4 : - if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args.regions_overlap = parse_overlap_option(optarg); + if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; default: usage(&args); } diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index 02f56b9..0302261 100644 --- a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2021 Genome Research Ltd. + Copyright (c) 2014-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -1131,8 +1131,6 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, return *baf<0 ? 0 : 1; } -int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); - static void cnv_next_line(args_t *args, bcf1_t *line) { if ( !line ) @@ -1381,16 +1379,12 @@ int main_vcfcnv(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(args); break; diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index 7562809..fd2e3bb 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2021 Genome Research Ltd. + Copyright (c) 2014-2022 Genome Research Ltd. Author: Petr Danecek @@ -1133,8 +1133,6 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, return *baf<0 ? 
0 : 1; } -int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); - static void cnv_next_line(args_t *args, bcf1_t *line) { if ( !line ) @@ -1383,16 +1381,12 @@ int main_vcfcnv(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(args); break; diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index 50013a1..0246b59 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -1014,10 +1014,8 @@ int main_vcfconcat(int argc, char *argv[]) case 8 : args->record_cmd_line = 0; break; case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; case 12 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 'v': args->verbose = strtol(optarg, 0, 0); diff --git a/bcftools/vcfconcat.c.pysam.c 
b/bcftools/vcfconcat.c.pysam.c index e0b23ad..e2cd43f 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -1016,10 +1016,8 @@ int main_vcfconcat(int argc, char *argv[]) case 8 : args->record_cmd_line = 0; break; case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; case 12 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 'v': args->verbose = strtol(optarg, 0, 0); diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index c0fddac..4a5d7ba 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -64,7 +64,7 @@ struct _args_t kstring_t str; int32_t *gts; float *flt; - int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; + int rev_als, output_vcf_ids, hap2dip, gen_3N6; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; int regions_overlap, targets_overlap; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; @@ -139,54 +139,83 @@ static void open_vcf(args_t *args, const char *format_str) free(samples); } -static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +// Try to set CHROM:POS_REF_ALT[_END]. 
Return 0 on success, -1 on error +static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) { args_t *args = (args_t*) usr; char tmp, *se = tsv->ss, *ss = tsv->ss; while ( se < tsv->se && *se!=':' ) se++; - if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( *se!=':' ) return -1; tmp = *se; *se = 0; - rec->rid = bcf_hdr_name2id(args->header,ss); - if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss); + int rid = bcf_hdr_name2id(args->header,ss); *se = tmp; + if ( rid<0 ) return -1; // POS - rec->pos = strtol(se+1,&ss,10); - if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss); - rec->pos--; - - // ID - if ( args->output_vcf_ids ) - { - char tmp = *tsv->se; - *tsv->se = 0; - bcf_update_id(args->header, rec, tsv->ss); - *tsv->se = tmp; - } + hts_pos_t pos = strtol(se+1,&ss,10); + if ( ss==se+1 ) return -1; + pos--; // REF,ALT args->str.l = 0; se = ++ss; while ( se < tsv->se && *se!='_' ) se++; - if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++; - if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) return -1; kputc(',',&args->str); kputsn(ss,se-ss,&args->str); - bcf_update_alleles_str(args->header, rec, args->str.s); // END - optional - if (*se && *se=='_') { + if (*se && *se=='_') + { long end = strtol(se+1,&ss,10); - if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss); + if ( ss==se+1 ) return -1; bcf_update_info_int32(args->header, rec, "END", &end, 1); } + rec->rid = rid; + rec->pos = pos; + bcf_update_alleles_str(args->header, rec, args->str.s); + return 0; } +static int 
tsv_setter_chrom_pos_ref_alt_or_chrom(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + int ret = _set_chrom_pos_ref_alt(tsv,rec,usr); + if ( !ret ) return ret; + return tsv_setter_chrom(tsv,rec,args->header); +} +static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + int ret = _set_chrom_pos_ref_alt(tsv,rec,usr); + if ( ret!=0 ) error("Could not parse the CHROM:POS_REF_ALT[_END] string: %s\n", tsv->ss); + return ret; +} +// This function must be called first, then tsv_setter_chrom_pos_ref_alt_id_or_die. +// One of them is expected to find the CHROM:POS_REF_ALT[_END] string, if not, die. +static int tsv_setter_chrom_pos_ref_alt_or_id(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + if ( _set_chrom_pos_ref_alt(tsv,rec,usr)==0 ) return 0; + rec->pos = -1; // mark the record as unset + if ( !args->output_vcf_ids) return 0; + return tsv_setter_id(tsv,rec,usr); +} +static int tsv_setter_chrom_pos_ref_alt_id_or_die(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + if ( rec->pos!=-1 ) + { + if ( !args->output_vcf_ids ) return 0; + return tsv_setter_id(tsv,rec,usr); + } + return tsv_setter_chrom_pos_ref_alt(tsv,rec,usr); +} static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr) { char *se; @@ -334,7 +363,8 @@ static void gensample_to_vcf(args_t *args) * * Second column is expected in the form of CHROM:POS_REF_ALT. We use second * column because the first can be empty ("--") when filling sites from reference - * panel. + * panel. When the option --vcf-ids is given, the first column is used to set the + * VCF ID. 
* * Output: VCF with filled GT,GP * @@ -362,22 +392,29 @@ static void gensample_to_vcf(args_t *args) if ( !gen_fh ) error("Could not read: %s\n", gen_fname); if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname); - // Find out the chromosome name, sample names, init and print the VCF header + // Find out the chromosome name, depending on the format variant (--3N6 or plain) and the ordering + // of the columns (CHROM:POS_REF_ALT comes first or second) args->str.l = 0; - char *ss, *se = line.s; + char *sb = line.s, *se = line.s; while ( *se && !isspace(*se) ) se++; - if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s); - ss = se+1; - se = strchr(ss,':'); - if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname); - kputsn(ss, se-ss, &args->str); - - tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP"); - tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); - tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); - tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args); + if ( !*se ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + if ( args->gen_3N6 ) // first column, just CHROM + kputsn(sb, se-sb, &args->str); + else // first or second column, part of CHROM:POS_REF_ALT + { + char *sc = strchr(sb,':'); + if ( !sc || sc > se ) + { + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + sb = ++se; + sc = strchr(sb,':'); + if ( !sc ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + } + kputsn(sb, sc-sb, &args->str); + } + // Initialize and print the VCF header, args->str.s contains the chr name args->header = bcf_hdr_init("w"); bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); @@ -385,6 +422,21 @@ static void gensample_to_vcf(args_t *args) bcf_hdr_printf(args->header, 
"##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + tsv_t *tsv; + if ( args->gen_3N6 ) + { + tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); + tsv_register(tsv, "CHROM", tsv_setter_chrom, args); + } + else + tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); + tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt_or_id, args); + tsv_register(tsv, "ID", tsv_setter_chrom_pos_ref_alt_id_or_die, args); + tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); + tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); + tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args); + + // Find out sample names int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); if ( !samples ) error("Could not read %s\n", sample_fname); @@ -456,6 +508,11 @@ static void haplegendsample_to_vcf(args_t *args) */ kstring_t line = {0,0,0}; + if ( args->output_vcf_ids ) + error( + "The option --haplegendsample2vcf cannot be combined with --vcf-ids. This is because the\n" + "ID column must be formatted as \"CHROM:POS_REF_ALT\" to check sanity of the operation\n"); + char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL; sample_fname = strchr(args->infname,','); if ( !sample_fname ) @@ -500,7 +557,6 @@ static void haplegendsample_to_vcf(args_t *args) tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL); tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_t *hap_tsv = tsv_init("HAPS"); tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args); @@ -582,7 +638,8 @@ static void hapsample_to_vcf(args_t *args) /* * Input: SHAPEIT output * - * 20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ... + * 20:19995888_A_G rsid1 19995888 A G 0 0 0 0 ... + * 20 20:19995888_A_G 19995888 A G 0 0 0 0 ... 
* * First column is expected in the form of CHROM:POS_REF_ALT * @@ -612,24 +669,49 @@ static void hapsample_to_vcf(args_t *args) if ( !hap_fh ) error("Could not read: %s\n", hap_fname); if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname); - // Find out the chromosome name, sample names, init and print the VCF header + // Find out the chromosome name, it can be either in the first or second column args->str.l = 0; - char *se = strchr(line.s,':'); - if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname); - kputsn(line.s, se-line.s, &args->str); - - tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS"); - tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); - tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); - tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_register(tsv, "HAPS", tsv_setter_haps, args); + char *sb = line.s, *se = line.s; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + if ( !args->output_vcf_ids ) + { + // first column should be just CHROM, but the second must be CHROM:POS_REF_ALT, use that + sb = ++se; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + if ( !strchr(sb,':') ) + error("Could not determine CHROM in the second column of %s: %s\n", hap_fname,line.s); + } + // Parse CHROM:POS_REF_ALT + char *sc = strchr(sb,':'); + if ( !sc || sc > se ) + error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + kputsn(sb, sc-sb, &args->str); + // Initialize and print the VCF header, args->str.s contains the chr name args->header = bcf_hdr_init("w"); bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, 
args->argv, "bcftools_convert"); + tsv_t *tsv; + if ( args->output_vcf_ids ) + { + tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,HAPS"); + tsv_register(tsv, "ID", tsv_setter_id, args); + } + else + { + tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,POS,REF_ALT,HAPS"); + tsv_register(tsv, "CHROM", tsv_setter_chrom_pos_ref_alt_or_chrom, args); + } + tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); + tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); + tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); + tsv_register(tsv, "HAPS", tsv_setter_haps, args); + int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); if ( !samples ) error("Could not read %s\n", sample_fname); @@ -712,13 +794,13 @@ static void vcf_to_gensample(args_t *args) kstring_t str = {0,0,0}; // insert chrom as first column if needed - if(args->output_chrom_first_col) + if ( args->gen_3N6 ) kputs("%CHROM ", &str); - else - kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); + + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); // insert rsid as second column if needed - if(args->output_vcf_ids) + if ( args->output_vcf_ids ) kputs("%ID ", &str); else kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); @@ -993,9 +1075,9 @@ static void vcf_to_hapsample(args_t *args) // print ID instead of CHROM:POS_REF_ALT1 if ( args->output_vcf_ids ) - kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else - kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); @@ -1419,6 +1501,7 @@ static void usage(void) fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(stderr, " -G, --gensample2vcf ... |,\n"); fprintf(stderr, " -g, --gensample ... 
|,\n"); + fprintf(stderr, " --3N6 Use 3*N+6 column format instead of the old 3*N+5 column format\n"); fprintf(stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, " --keep-duplicates Keep duplicate positions\n"); @@ -1493,7 +1576,8 @@ int main_vcfconvert(int argc, char *argv[]) {"gensample",required_argument,NULL,'g'}, {"gensample2vcf",required_argument,NULL,'G'}, {"tag",required_argument,NULL,1}, - {"chrom",no_argument,NULL,8}, + {"chrom",no_argument,NULL,8}, + {"3N6",no_argument,NULL,15}, {"tsv2vcf",required_argument,NULL,2}, {"hapsample",required_argument,NULL,7}, {"hapsample2vcf",required_argument,NULL,3}, @@ -1532,7 +1616,8 @@ int main_vcfconvert(int argc, char *argv[]) case 5 : args->hap2dip = 1; break; case 6 : args->convert_func = gvcf_to_vcf; break; case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; - case 8 : args->output_chrom_first_col = 1; break; + case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break; + case 15 : args->gen_3N6 = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1561,16 +1646,12 @@ int main_vcfconvert(int argc, char *argv[]) case 11 : args->sex_fname = optarg; break; case 12 : args->keep_duplicates = 1; break; case 13 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 14 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( 
!strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index 8f8d4a3..0e64b34 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -66,7 +66,7 @@ struct _args_t kstring_t str; int32_t *gts; float *flt; - int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; + int rev_als, output_vcf_ids, hap2dip, gen_3N6; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; int regions_overlap, targets_overlap; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; @@ -141,54 +141,83 @@ static void open_vcf(args_t *args, const char *format_str) free(samples); } -static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +// Try to set CHROM:POS_REF_ALT[_END]. 
Return 0 on success, -1 on error +static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) { args_t *args = (args_t*) usr; char tmp, *se = tsv->ss, *ss = tsv->ss; while ( se < tsv->se && *se!=':' ) se++; - if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( *se!=':' ) return -1; tmp = *se; *se = 0; - rec->rid = bcf_hdr_name2id(args->header,ss); - if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss); + int rid = bcf_hdr_name2id(args->header,ss); *se = tmp; + if ( rid<0 ) return -1; // POS - rec->pos = strtol(se+1,&ss,10); - if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss); - rec->pos--; - - // ID - if ( args->output_vcf_ids ) - { - char tmp = *tsv->se; - *tsv->se = 0; - bcf_update_id(args->header, rec, tsv->ss); - *tsv->se = tmp; - } + hts_pos_t pos = strtol(se+1,&ss,10); + if ( ss==se+1 ) return -1; + pos--; // REF,ALT args->str.l = 0; se = ++ss; while ( se < tsv->se && *se!='_' ) se++; - if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++; - if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) return -1; kputc(',',&args->str); kputsn(ss,se-ss,&args->str); - bcf_update_alleles_str(args->header, rec, args->str.s); // END - optional - if (*se && *se=='_') { + if (*se && *se=='_') + { long end = strtol(se+1,&ss,10); - if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss); + if ( ss==se+1 ) return -1; bcf_update_info_int32(args->header, rec, "END", &end, 1); } + rec->rid = rid; + rec->pos = pos; + bcf_update_alleles_str(args->header, rec, args->str.s); + return 0; } +static int 
tsv_setter_chrom_pos_ref_alt_or_chrom(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + int ret = _set_chrom_pos_ref_alt(tsv,rec,usr); + if ( !ret ) return ret; + return tsv_setter_chrom(tsv,rec,args->header); +} +static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + int ret = _set_chrom_pos_ref_alt(tsv,rec,usr); + if ( ret!=0 ) error("Could not parse the CHROM:POS_REF_ALT[_END] string: %s\n", tsv->ss); + return ret; +} +// This function must be called first, then tsv_setter_chrom_pos_ref_alt_id_or_die. +// One of them is expected to find the CHROM:POS_REF_ALT[_END] string, if not, die. +static int tsv_setter_chrom_pos_ref_alt_or_id(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + if ( _set_chrom_pos_ref_alt(tsv,rec,usr)==0 ) return 0; + rec->pos = -1; // mark the record as unset + if ( !args->output_vcf_ids) return 0; + return tsv_setter_id(tsv,rec,usr); +} +static int tsv_setter_chrom_pos_ref_alt_id_or_die(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + if ( rec->pos!=-1 ) + { + if ( !args->output_vcf_ids ) return 0; + return tsv_setter_id(tsv,rec,usr); + } + return tsv_setter_chrom_pos_ref_alt(tsv,rec,usr); +} static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr) { char *se; @@ -336,7 +365,8 @@ static void gensample_to_vcf(args_t *args) * * Second column is expected in the form of CHROM:POS_REF_ALT. We use second * column because the first can be empty ("--") when filling sites from reference - * panel. + * panel. When the option --vcf-ids is given, the first column is used to set the + * VCF ID. 
* * Output: VCF with filled GT,GP * @@ -364,22 +394,29 @@ static void gensample_to_vcf(args_t *args) if ( !gen_fh ) error("Could not read: %s\n", gen_fname); if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname); - // Find out the chromosome name, sample names, init and print the VCF header + // Find out the chromosome name, depending on the format variant (--3N6 or plain) and the ordering + // of the columns (CHROM:POS_REF_ALT comes first or second) args->str.l = 0; - char *ss, *se = line.s; + char *sb = line.s, *se = line.s; while ( *se && !isspace(*se) ) se++; - if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s); - ss = se+1; - se = strchr(ss,':'); - if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname); - kputsn(ss, se-ss, &args->str); - - tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP"); - tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); - tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); - tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args); + if ( !*se ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + if ( args->gen_3N6 ) // first column, just CHROM + kputsn(sb, se-sb, &args->str); + else // first or second column, part of CHROM:POS_REF_ALT + { + char *sc = strchr(sb,':'); + if ( !sc || sc > se ) + { + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + sb = ++se; + sc = strchr(sb,':'); + if ( !sc ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + } + kputsn(sb, sc-sb, &args->str); + } + // Initialize and print the VCF header, args->str.s contains the chr name args->header = bcf_hdr_init("w"); bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); @@ -387,6 +424,21 @@ static void gensample_to_vcf(args_t *args) bcf_hdr_printf(args->header, 
"##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + tsv_t *tsv; + if ( args->gen_3N6 ) + { + tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); + tsv_register(tsv, "CHROM", tsv_setter_chrom, args); + } + else + tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); + tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt_or_id, args); + tsv_register(tsv, "ID", tsv_setter_chrom_pos_ref_alt_id_or_die, args); + tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); + tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); + tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args); + + // Find out sample names int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); if ( !samples ) error("Could not read %s\n", sample_fname); @@ -458,6 +510,11 @@ static void haplegendsample_to_vcf(args_t *args) */ kstring_t line = {0,0,0}; + if ( args->output_vcf_ids ) + error( + "The option --haplegendsample2vcf cannot be combined with --vcf-ids. This is because the\n" + "ID column must be formatted as \"CHROM:POS_REF_ALT\" to check sanity of the operation\n"); + char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL; sample_fname = strchr(args->infname,','); if ( !sample_fname ) @@ -502,7 +559,6 @@ static void haplegendsample_to_vcf(args_t *args) tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL); tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_t *hap_tsv = tsv_init("HAPS"); tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args); @@ -584,7 +640,8 @@ static void hapsample_to_vcf(args_t *args) /* * Input: SHAPEIT output * - * 20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ... + * 20:19995888_A_G rsid1 19995888 A G 0 0 0 0 ... + * 20 20:19995888_A_G 19995888 A G 0 0 0 0 ... 
* * First column is expected in the form of CHROM:POS_REF_ALT * @@ -614,24 +671,49 @@ static void hapsample_to_vcf(args_t *args) if ( !hap_fh ) error("Could not read: %s\n", hap_fname); if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname); - // Find out the chromosome name, sample names, init and print the VCF header + // Find out the chromosome name, it can be either in the first or second column args->str.l = 0; - char *se = strchr(line.s,':'); - if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname); - kputsn(line.s, se-line.s, &args->str); - - tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS"); - tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); - tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); - tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_register(tsv, "HAPS", tsv_setter_haps, args); + char *sb = line.s, *se = line.s; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + if ( !args->output_vcf_ids ) + { + // first column should be just CHROM, but the second must be CHROM:POS_REF_ALT, use that + sb = ++se; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + if ( !strchr(sb,':') ) + error("Could not determine CHROM in the second column of %s: %s\n", hap_fname,line.s); + } + // Parse CHROM:POS_REF_ALT + char *sc = strchr(sb,':'); + if ( !sc || sc > se ) + error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + kputsn(sb, sc-sb, &args->str); + // Initialize and print the VCF header, args->str.s contains the chr name args->header = bcf_hdr_init("w"); bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, 
args->argv, "bcftools_convert"); + tsv_t *tsv; + if ( args->output_vcf_ids ) + { + tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,HAPS"); + tsv_register(tsv, "ID", tsv_setter_id, args); + } + else + { + tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,POS,REF_ALT,HAPS"); + tsv_register(tsv, "CHROM", tsv_setter_chrom_pos_ref_alt_or_chrom, args); + } + tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); + tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); + tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); + tsv_register(tsv, "HAPS", tsv_setter_haps, args); + int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); if ( !samples ) error("Could not read %s\n", sample_fname); @@ -714,13 +796,13 @@ static void vcf_to_gensample(args_t *args) kstring_t str = {0,0,0}; // insert chrom as first column if needed - if(args->output_chrom_first_col) + if ( args->gen_3N6 ) kputs("%CHROM ", &str); - else - kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); + + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); // insert rsid as second column if needed - if(args->output_vcf_ids) + if ( args->output_vcf_ids ) kputs("%ID ", &str); else kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); @@ -995,9 +1077,9 @@ static void vcf_to_hapsample(args_t *args) // print ID instead of CHROM:POS_REF_ALT1 if ( args->output_vcf_ids ) - kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else - kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); @@ -1421,6 +1503,7 @@ static void usage(void) fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(bcftools_stderr, " -G, --gensample2vcf ... |,\n"); fprintf(bcftools_stderr, " -g, --gensample ... 
|,\n"); + fprintf(bcftools_stderr, " --3N6 Use 3*N+6 column format instead of the old 3*N+5 column format\n"); fprintf(bcftools_stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(bcftools_stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n"); fprintf(bcftools_stderr, " --keep-duplicates Keep duplicate positions\n"); @@ -1495,7 +1578,8 @@ int main_vcfconvert(int argc, char *argv[]) {"gensample",required_argument,NULL,'g'}, {"gensample2vcf",required_argument,NULL,'G'}, {"tag",required_argument,NULL,1}, - {"chrom",no_argument,NULL,8}, + {"chrom",no_argument,NULL,8}, + {"3N6",no_argument,NULL,15}, {"tsv2vcf",required_argument,NULL,2}, {"hapsample",required_argument,NULL,7}, {"hapsample2vcf",required_argument,NULL,3}, @@ -1534,7 +1618,8 @@ int main_vcfconvert(int argc, char *argv[]) case 5 : args->hap2dip = 1; break; case 6 : args->convert_func = gvcf_to_vcf; break; case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; - case 8 : args->output_chrom_first_col = 1; break; + case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break; + case 15 : args->gen_3N6 = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1563,16 +1648,12 @@ int main_vcfconvert(int argc, char *argv[]) case 11 : args->sex_fname = optarg; break; case 12 : args->keep_duplicates = 1; break; case 13 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 14 : - if ( !strcasecmp(optarg,"0") ) 
args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index 06b8d19..68d8672 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -39,6 +39,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "rbuf.h" +#include "regidx.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 @@ -73,8 +74,9 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; - char **argv, *output_fname, *targets_list, *regions_list; - int argc, record_cmd_line; + char **argv, *output_fname, *targets_list, *regions_list, *mask_list; + int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; + regidx_t *mask; } args_t; @@ -86,11 +88,30 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->mask_list ) + { + if ( args->mask_list[0]=='^' ) args->mask_negate = 1; + if ( args->mask_is_file ) + args->mask = regidx_init(args->mask_negate?args->mask_list+1:args->mask_list,NULL,NULL,0,NULL); + else + { + char *rmme = strdup(args->mask_negate?args->mask_list+1:args->mask_list), *tmp = rmme; + while ( *tmp ) + { + if ( *tmp==',' ) *tmp = '\n'; + tmp++; + } + args->mask = regidx_init_string(rmme, 
regidx_parse_reg, NULL, 0, NULL); + free(rmme); + } + if ( !args->mask ) + error("Could not initialize the mask: %s\n",args->mask_list); + } + args->hdr = args->files->readers[0].header; args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS"); assert( !args->flt_pass ); // sanity check: required by BCF spec - // -i or -e: append FILTER line - if ( args->soft_filter && args->filter_logic ) + if ( args->soft_filter && (args->filter_logic || args->mask_list) ) { kstring_t flt_name = {0,0,0}; if ( strcmp(args->soft_filter,"+") ) @@ -106,18 +127,28 @@ static void init_data(args_t *args) } while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) ); } - // escape quotes + kstring_t tmp = {0,0,0}; - char *t = args->filter_str; - while ( *t ) + if ( args->filter_logic ) { - if ( *t=='"' ) kputc('\\',&tmp); - kputc(*t,&tmp); - t++; + // -i or -e: append FILTER line + ksprintf(&tmp,"Set if %s: ",args->filter_logic & FLT_INCLUDE ? "not true" : "true"); + + // escape quotes + char *t = args->filter_str; + while ( *t ) + { + if ( *t=='"' ) kputc('\\',&tmp); + kputc(*t,&tmp); + t++; + } } - int ret = bcf_hdr_printf(args->hdr, "##FILTER=", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s); + else if ( args->mask_list ) + ksprintf(&tmp,"Record masked by region"); + + int ret = bcf_hdr_printf(args->hdr, "##FILTER=", flt_name.s,tmp.s); if ( ret!=0 ) - error("Failed to append header line: ##FILTER=\n", flt_name.s,args->filter_logic & FLT_INCLUDE ? 
"not true" : "true", tmp.s); + error("Failed to append header line: ##FILTER=\n", flt_name.s,tmp.s); args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s); assert( args->flt_fail>=0 ); free(flt_name.s); free(tmp.s); @@ -174,6 +205,7 @@ static void destroy_data(args_t *args) filter_destroy(args->filter); free(args->tmpi); free(args->tmp_ac); + if ( args->mask ) regidx_destroy(args->mask); } static void flush_buffer(args_t *args, int n) @@ -403,6 +435,35 @@ static void set_genotypes(args_t *args, bcf1_t *line, int pass_site) if ( has_ac ) bcf_update_info_int32(args->hdr,line,"AC",args->tmp_ac,line->n_allele-1); } +static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t off; + if ( rec->n_allele ) + { + off = rec->rlen; + bcf_unpack(rec, BCF_UN_STR); + int i; + for (i=1; in_allele; i++) + { + // Make symbolic alleles start at POS, although this is not strictly true for + // , where POS should be the position BEFORE the deletion/insertion. + // However, since arbitrary symbolic alleles can be defined by the user, we + // will simplify the interpretation of --targets-overlap and --region-overlap. 
+ int j = 0; + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[i]; + while ( ref[j] && alt[j] && ref[j]==alt[j] ) j++; + if ( off > j ) off = j; + if ( !off ) break; + } + } + else + off = 0; + + *beg = rec->pos + off; + *end = rec->pos + rec->rlen - 1; +} + static void usage(args_t *args) { fprintf(stderr, "\n"); @@ -414,6 +475,9 @@ static void usage(args_t *args) fprintf(stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); fprintf(stderr, " -G, --IndelGap INT Filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); fprintf(stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n"); + fprintf(stderr, " --mask [^]REGION Soft filter regions, \"^\" to negate\n"); + fprintf(stderr, " -M, --mask-file [^]FILE Soft filter regions listed in a file, \"^\" to negate\n"); + fprintf(stderr, " --mask-overlap 0|1|2 Mask if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -445,11 +509,15 @@ int main_vcffilter(int argc, char *argv[]) int regions_is_file = 0, targets_is_file = 0; int regions_overlap = 1; int targets_overlap = 0; + args->mask_overlap = 1; static struct option loptions[] = { {"set-GTs",required_argument,NULL,'S'}, {"mode",required_argument,NULL,'m'}, + {"mask",required_argument,NULL,10}, + {"mask-file",required_argument,NULL,'M'}, + {"mask-overlap",required_argument,NULL,11}, {"soft-filter",required_argument,NULL,'s'}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, @@ -542,16 +610,20 @@ int main_vcffilter(int argc, char 
*argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 10 : args->mask_list = optarg; break; + case 'M' : args->mask_list = optarg; args->mask_is_file = 1; break; + case 11 : + if ( !strcasecmp(optarg,"0") ) args->mask_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->mask_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; + else error("Could not parse: --mask-overlap %s\n",optarg); break; case 'h': case '?': usage(args); break; @@ -568,6 +640,8 @@ int main_vcffilter(int argc, char *argv[]) } else fname = argv[optind]; + if ( args->mask_list && !args->soft_filter ) error("The option --soft-filter is required with --mask and --mask-file options\n"); + // read in the regions from the command line if ( args->regions_list ) { @@ -607,6 +681,16 @@ int main_vcffilter(int argc, char *argv[]) pass = filter_test(args->filter, line, &args->smpl_pass); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 
0 : 1; } + if ( args->mask ) + { + hts_pos_t beg, end; + if ( args->mask_overlap==0 ) beg = end = line->pos; + else if ( args->mask_overlap==1 ) beg = line->pos, end = line->pos + line->rlen - 1; + else _set_variant_boundaries(line,&beg,&end); + int mpass = regidx_overlap(args->mask,bcf_seqname(args->hdr,line),beg,end,NULL) ? 0 : 1; + if ( args->mask_negate ) mpass = mpass ? 0 : 1; + pass &= mpass; + } if ( args->soft_filter || args->set_gts || pass ) { if ( pass ) diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index 28824dc..f998083 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -2,7 +2,7 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -41,6 +41,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "rbuf.h" +#include "regidx.h" // Logic of the filters: include or exclude sites which match the filters? 
#define FLT_INCLUDE 1 @@ -75,8 +76,9 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; - char **argv, *output_fname, *targets_list, *regions_list; - int argc, record_cmd_line; + char **argv, *output_fname, *targets_list, *regions_list, *mask_list; + int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; + regidx_t *mask; } args_t; @@ -88,11 +90,30 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->mask_list ) + { + if ( args->mask_list[0]=='^' ) args->mask_negate = 1; + if ( args->mask_is_file ) + args->mask = regidx_init(args->mask_negate?args->mask_list+1:args->mask_list,NULL,NULL,0,NULL); + else + { + char *rmme = strdup(args->mask_negate?args->mask_list+1:args->mask_list), *tmp = rmme; + while ( *tmp ) + { + if ( *tmp==',' ) *tmp = '\n'; + tmp++; + } + args->mask = regidx_init_string(rmme, regidx_parse_reg, NULL, 0, NULL); + free(rmme); + } + if ( !args->mask ) + error("Could not initialize the mask: %s\n",args->mask_list); + } + args->hdr = args->files->readers[0].header; args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS"); assert( !args->flt_pass ); // sanity check: required by BCF spec - // -i or -e: append FILTER line - if ( args->soft_filter && args->filter_logic ) + if ( args->soft_filter && (args->filter_logic || args->mask_list) ) { kstring_t flt_name = {0,0,0}; if ( strcmp(args->soft_filter,"+") ) @@ -108,18 +129,28 @@ static void init_data(args_t *args) } while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) ); } - // escape quotes + kstring_t tmp = {0,0,0}; - char *t = args->filter_str; - while ( *t ) + if ( args->filter_logic ) { - if ( *t=='"' ) kputc('\\',&tmp); - kputc(*t,&tmp); - t++; + // -i or -e: append FILTER line + ksprintf(&tmp,"Set if %s: ",args->filter_logic & FLT_INCLUDE ? 
"not true" : "true"); + + // escape quotes + char *t = args->filter_str; + while ( *t ) + { + if ( *t=='"' ) kputc('\\',&tmp); + kputc(*t,&tmp); + t++; + } } - int ret = bcf_hdr_printf(args->hdr, "##FILTER=", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s); + else if ( args->mask_list ) + ksprintf(&tmp,"Record masked by region"); + + int ret = bcf_hdr_printf(args->hdr, "##FILTER=", flt_name.s,tmp.s); if ( ret!=0 ) - error("Failed to append header line: ##FILTER=\n", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s); + error("Failed to append header line: ##FILTER=\n", flt_name.s,tmp.s); args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s); assert( args->flt_fail>=0 ); free(flt_name.s); free(tmp.s); @@ -176,6 +207,7 @@ static void destroy_data(args_t *args) filter_destroy(args->filter); free(args->tmpi); free(args->tmp_ac); + if ( args->mask ) regidx_destroy(args->mask); } static void flush_buffer(args_t *args, int n) @@ -405,6 +437,35 @@ static void set_genotypes(args_t *args, bcf1_t *line, int pass_site) if ( has_ac ) bcf_update_info_int32(args->hdr,line,"AC",args->tmp_ac,line->n_allele-1); } +static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t off; + if ( rec->n_allele ) + { + off = rec->rlen; + bcf_unpack(rec, BCF_UN_STR); + int i; + for (i=1; in_allele; i++) + { + // Make symbolic alleles start at POS, although this is not strictly true for + // , where POS should be the position BEFORE the deletion/insertion. + // However, since arbitrary symbolic alleles can be defined by the user, we + // will simplify the interpretation of --targets-overlap and --region-overlap. 
+ int j = 0; + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[i]; + while ( ref[j] && alt[j] && ref[j]==alt[j] ) j++; + if ( off > j ) off = j; + if ( !off ) break; + } + } + else + off = 0; + + *beg = rec->pos + off; + *end = rec->pos + rec->rlen - 1; +} + static void usage(args_t *args) { fprintf(bcftools_stderr, "\n"); @@ -416,6 +477,9 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); fprintf(bcftools_stderr, " -G, --IndelGap INT Filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); fprintf(bcftools_stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n"); + fprintf(bcftools_stderr, " --mask [^]REGION Soft filter regions, \"^\" to negate\n"); + fprintf(bcftools_stderr, " -M, --mask-file [^]FILE Soft filter regions listed in a file, \"^\" to negate\n"); + fprintf(bcftools_stderr, " --mask-overlap 0|1|2 Mask if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -447,11 +511,15 @@ int main_vcffilter(int argc, char *argv[]) int regions_is_file = 0, targets_is_file = 0; int regions_overlap = 1; int targets_overlap = 0; + args->mask_overlap = 1; static struct option loptions[] = { {"set-GTs",required_argument,NULL,'S'}, {"mode",required_argument,NULL,'m'}, + {"mask",required_argument,NULL,10}, + {"mask-file",required_argument,NULL,'M'}, + {"mask-overlap",required_argument,NULL,11}, {"soft-filter",required_argument,NULL,'s'}, {"exclude",required_argument,NULL,'e'}, 
{"include",required_argument,NULL,'i'}, @@ -544,16 +612,20 @@ int main_vcffilter(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 10 : args->mask_list = optarg; break; + case 'M' : args->mask_list = optarg; args->mask_is_file = 1; break; + case 11 : + if ( !strcasecmp(optarg,"0") ) args->mask_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->mask_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; + else error("Could not parse: --mask-overlap %s\n",optarg); break; case 'h': case '?': usage(args); break; @@ -570,6 +642,8 @@ int main_vcffilter(int argc, char *argv[]) } else fname = argv[optind]; + if ( args->mask_list && !args->soft_filter ) error("The option --soft-filter is required with --mask and --mask-file options\n"); + // read in the regions from the command line if ( args->regions_list ) { @@ -609,6 +683,16 @@ int main_vcffilter(int argc, char *argv[]) pass = filter_test(args->filter, line, &args->smpl_pass); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 
0 : 1; } + if ( args->mask ) + { + hts_pos_t beg, end; + if ( args->mask_overlap==0 ) beg = end = line->pos; + else if ( args->mask_overlap==1 ) beg = line->pos, end = line->pos + line->rlen - 1; + else _set_variant_boundaries(line,&beg,&end); + int mpass = regidx_overlap(args->mask,bcf_seqname(args->hdr,line),beg,end,NULL) ? 0 : 1; + if ( args->mask_negate ) mpass = mpass ? 0 : 1; + pass &= mpass; + } if ( args->soft_filter || args->set_gts || pass ) { if ( pass ) diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index 4d36b91..f646e1f 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -1214,16 +1214,12 @@ int main_vcfgtcheck(int argc, char *argv[]) case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 7 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 8 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(); break; diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index d13dd84..e0a70ba 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -1216,16 +1216,12 @@ int main_vcfgtcheck(int argc, char *argv[]) case 't': args->targets = optarg; break; case 'T': args->targets = optarg; 
args->targets_is_file = 1; break; case 7 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 8 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(); break; diff --git a/bcftools/vcfhead.c b/bcftools/vcfhead.c new file mode 100644 index 0000000..20be2a9 --- /dev/null +++ b/bcftools/vcfhead.c @@ -0,0 +1,133 @@ +/* vcfhead.c -- view VCF/BCF file headers. + + Copyright (C) 2021 University of Glasgow. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "bcftools.h" + +int main_vcfhead(int argc, char *argv[]) +{ + static const char usage[] = +"\n" +"About: Displays VCF/BCF headers and optionally the first few variant records\n" +"Usage: bcftools head [OPTION]... [FILE]\n" +"\n" +"Options:\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT variant record lines [none]\n" +"\n"; + + static const struct option loptions[] = { + { "headers", required_argument, NULL, 'h' }, + { "records", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + + int all_headers = 1; + uint64_t nheaders = 0; + uint64_t nrecords = 0; + + int c, nargs; + while ((c = getopt_long(argc, argv, "h:n:", loptions, NULL)) >= 0) + switch (c) { + case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; + case 'n': nrecords = strtoull(optarg, NULL, 0); break; + default: + fputs(usage, stderr); + return EXIT_FAILURE; + } + + nargs = argc - optind; + if (nargs == 0 && isatty(STDIN_FILENO)) { + fputs(usage, stdout); + return EXIT_SUCCESS; + } + else if (nargs > 1) { + fputs(usage, stderr); + return EXIT_FAILURE; + } + + const char *fname = (nargs == 1)? 
argv[optind] : "-"; + vcfFile *fp = bcf_open(fname, "r"); + if (fp == NULL) { + if (strcmp(fname, "-") != 0) + error_errno("[%s] Can't open \"%s\"", __func__, fname); + else + error_errno("[%s] Can't open standard input", __func__); + } + + bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (hdr == NULL) { + bcf_close(fp); + if (strcmp(fname, "-") != 0) + error("[%s] Can't read headers from \"%s\"\n", __func__, fname); + else + error("[%s] Can't read headers\n", __func__); + } + + kstring_t str = KS_INITIALIZE; + + if (all_headers) { + bcf_hdr_format(hdr, 0, &str); + fputs(ks_str(&str), stdout); + } + else if (nheaders > 0) { + bcf_hdr_format(hdr, 0, &str); + char *lim = str.s; + uint64_t n; + for (n = 0; n < nheaders; n++) { + lim = strchr(lim, '\n'); + if (lim) lim++; + else break; + } + if (lim) *lim = '\0'; + fputs(ks_str(&str), stdout); + } + + if (nrecords > 0) { + bcf1_t *rec = bcf_init(); + uint64_t n; + for (n = 0; n < nrecords && bcf_read(fp, hdr, rec) >= 0; n++) { + ks_clear(&str); + if (vcf_format(hdr, rec, &str) >= 0) + fputs(ks_str(&str), stdout); + else + fprintf(stderr, "[%s] Record #%"PRIu64 " is invalid\n", __func__, n+1); + } + bcf_destroy(rec); + } + + ks_free(&str); + bcf_hdr_destroy(hdr); + bcf_close(fp); + + return EXIT_SUCCESS; +} diff --git a/bcftools/vcfhead.c.pysam.c b/bcftools/vcfhead.c.pysam.c new file mode 100644 index 0000000..09744f2 --- /dev/null +++ b/bcftools/vcfhead.c.pysam.c @@ -0,0 +1,135 @@ +#include "bcftools.pysam.h" + +/* vcfhead.c -- view VCF/BCF file headers. + + Copyright (C) 2021 University of Glasgow. 
+ + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "bcftools.h" + +int main_vcfhead(int argc, char *argv[]) +{ + static const char usage[] = +"\n" +"About: Displays VCF/BCF headers and optionally the first few variant records\n" +"Usage: bcftools head [OPTION]... 
[FILE]\n" +"\n" +"Options:\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT variant record lines [none]\n" +"\n"; + + static const struct option loptions[] = { + { "headers", required_argument, NULL, 'h' }, + { "records", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + + int all_headers = 1; + uint64_t nheaders = 0; + uint64_t nrecords = 0; + + int c, nargs; + while ((c = getopt_long(argc, argv, "h:n:", loptions, NULL)) >= 0) + switch (c) { + case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; + case 'n': nrecords = strtoull(optarg, NULL, 0); break; + default: + fputs(usage, bcftools_stderr); + return EXIT_FAILURE; + } + + nargs = argc - optind; + if (nargs == 0 && isatty(STDIN_FILENO)) { + fputs(usage, bcftools_stdout); + return EXIT_SUCCESS; + } + else if (nargs > 1) { + fputs(usage, bcftools_stderr); + return EXIT_FAILURE; + } + + const char *fname = (nargs == 1)? argv[optind] : "-"; + vcfFile *fp = bcf_open(fname, "r"); + if (fp == NULL) { + if (strcmp(fname, "-") != 0) + error_errno("[%s] Can't open \"%s\"", __func__, fname); + else + error_errno("[%s] Can't open standard input", __func__); + } + + bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (hdr == NULL) { + bcf_close(fp); + if (strcmp(fname, "-") != 0) + error("[%s] Can't read headers from \"%s\"\n", __func__, fname); + else + error("[%s] Can't read headers\n", __func__); + } + + kstring_t str = KS_INITIALIZE; + + if (all_headers) { + bcf_hdr_format(hdr, 0, &str); + fputs(ks_str(&str), bcftools_stdout); + } + else if (nheaders > 0) { + bcf_hdr_format(hdr, 0, &str); + char *lim = str.s; + uint64_t n; + for (n = 0; n < nheaders; n++) { + lim = strchr(lim, '\n'); + if (lim) lim++; + else break; + } + if (lim) *lim = '\0'; + fputs(ks_str(&str), bcftools_stdout); + } + + if (nrecords > 0) { + bcf1_t *rec = bcf_init(); + uint64_t n; + for (n = 0; n < nrecords && bcf_read(fp, hdr, rec) >= 0; n++) { + ks_clear(&str); + if (vcf_format(hdr, rec, 
&str) >= 0) + fputs(ks_str(&str), bcftools_stdout); + else + fprintf(bcftools_stderr, "[%s] Record #%"PRIu64 " is invalid\n", __func__, n+1); + } + bcf_destroy(rec); + } + + ks_free(&str); + bcf_hdr_destroy(hdr); + bcf_close(fp); + + return EXIT_SUCCESS; +} diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index acc1885..c4c09f5 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -182,6 +182,7 @@ void isec_vcf(args_t *args) } ret |= 1<isec_op) { @@ -598,16 +599,12 @@ int main_vcfisec(int argc, char *argv[]) } break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index 87178cf..f826384 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -2,7 +2,7 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -184,6 +184,7 @@ void isec_vcf(args_t *args) } ret |= 1<isec_op) { @@ -600,16 +601,12 @@ int main_vcfisec(int argc, char *argv[]) } break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index f87bce7..60a80c9 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -387,7 +387,7 @@ static void info_rules_init(args_t *args) rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); - else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); + else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1; @@ -814,7 +814,7 @@ void maux_expand1(buffer_t *buf, int size) buf->mrec = size; } } -void maux_reset(maux_t *ma) +void maux_reset(maux_t *ma, int *rid_tab) { int i,j; for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); @@ -846,7 +846,10 @@ void maux_reset(maux_t *ma) for (i=0; in; i++) { bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); - ma->buf[i].rid = bcf_hdr_name2id(hdr,chr); + if (new_chr) + rid_tab[i] = bcf_hdr_name2id(hdr,chr); + + ma->buf[i].rid = rid_tab[i]; ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1; for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) { @@ -1071,8 +1074,8 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t /* * copy_string_field() - copy a comma-separated field * @param src: source string - * @param isrc: index of the field to copy - * @param src_len: length of source string (excluding the terminating \0) + * @param isrc: index of the field to copy + * @param src_len: length of source string (excluding the terminating \0) * @param dst: destination kstring (must be initialized with missing values, e.g. 
as ".") * @param idst: index of the destination field */ @@ -1267,7 +1270,12 @@ void merge_info(args_t *args, bcf1_t *out) bcf_info_t *inf = &line->d.info[j]; const char *key = hdr->id[BCF_DT_ID][inf->key].key; - if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done + // AC and AN are done in merge_format() after genotypes are done + if (!args->keep_AC_AN && + (key[0] == 'A' + && (key[1] == 'C' || key[1] == 'N') + && key[2] == 0)) + continue; int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key); if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key); @@ -1326,7 +1334,7 @@ void merge_info(args_t *args, bcf1_t *out) out->d.info[out->n_info].vptr_off = inf->vptr_off; out->d.info[out->n_info].vptr_len = inf->vptr_len; out->d.info[out->n_info].vptr_free = 1; - out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); + out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off); out->d.info[out->n_info].vptr += inf->vptr_off; if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) ) @@ -1427,7 +1435,7 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL) if ( line->n_allele <= args->local_alleles + 1 ) { - // sort to the output order, insertion sort, ascending + // sort to the output order, insertion sort, ascending int *map = ma->buf[i].rec[ma->buf[i].cur].map; int *k2k = ma->k2k; int tmp; @@ -1738,7 +1746,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf int iori,inew; for (iori=ifrom; iorin_allele; iori++) { - inew = ma->buf[i].rec[irec].map[iori] - ifrom; + inew = ma->buf[i].rec[irec].map[iori] - ifrom; int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. 
%d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); } @@ -2302,7 +2310,7 @@ void gvcf_set_alleles(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; - for (i=0; inals; i++) + for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; @@ -2365,11 +2373,11 @@ void gvcf_write_block(args_t *args, int start, int end) for (i=0; ifiles->nreaders; i++) { if ( !gaux[i].active ) continue; - if ( gaux[i].end < start ) - { - gaux[i].active = 0; + if ( gaux[i].end < start ) + { + gaux[i].active = 0; maux->buf[i].cur = -1; - continue; + continue; } gaux[i].line->d.allele[0][0] = ref; if ( min > gaux[i].end ) min = gaux[i].end; @@ -2422,9 +2430,9 @@ void gvcf_write_block(args_t *args, int start, int end) if ( !gaux[i].active ) continue; if ( gaux[i].end < end ) { - gaux[i].active = 0; + gaux[i].active = 0; maux->buf[i].cur = -1; - continue; + continue; } // next min END position bigger than the current one if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1; @@ -2447,7 +2455,7 @@ void gvcf_write_block(args_t *args, int start, int end) 3 END=5 A B C 6 END=7 A B . 8 END=10 A . . 
- + */ void gvcf_flush(args_t *args, int done) { @@ -2581,7 +2589,7 @@ void gvcf_stage(args_t *args, int pos) if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; } else - maux->gvcf_break = line->pos; // must break the gvcf block + maux->gvcf_break = line->pos; // must break the gvcf block } maux->ntmp_arr = nend * sizeof(int32_t); maux->tmp_arr = end; @@ -2702,7 +2710,7 @@ int can_merge(args_t *args) char *id = NULL, ref = 'N'; int i,j,k, ntodo = 0; - for (i=0; inals; i++) + for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; @@ -3060,13 +3068,17 @@ void merge_vcf(args_t *args) args->out_line = bcf_init1(); args->tmph = kh_init(strdict); + int *rid_tab = calloc(args->maux->n, sizeof(*rid_tab)); + if (!rid_tab) + error("[%s:%d] Could not allocate %zu bytes\n", __FILE__, __LINE__, args->maux->n*sizeof(*rid_tab)); + while ( bcf_sr_next_line(args->files) ) { // output cached gVCF blocks which end before the new record if ( args->do_gvcf ) gvcf_flush(args,0); - maux_reset(args->maux); + maux_reset(args->maux, rid_tab); // determine which of the new records are gvcf blocks if ( args->do_gvcf ) @@ -3080,6 +3092,7 @@ void merge_vcf(args_t *args) clean_buffer(args); // debug_state(args); } + free(rid_tab); if ( args->do_gvcf ) gvcf_flush(args,1); @@ -3173,7 +3186,7 @@ int main_vcfmerge(int argc, char *argv[]) if ( args->local_alleles < 1 ) error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg); break; - case 'F': + case 'F': if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; else error("Filter logic not recognised: %s\n", optarg); @@ -3226,10 +3239,8 @@ int main_vcfmerge(int argc, char *argv[]) case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; case 4 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( 
!strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 03119ae..0a373ef 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -2,7 +2,7 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -389,7 +389,7 @@ static void info_rules_init(args_t *args) rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); - else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); + else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1; @@ -816,7 +816,7 @@ void maux_expand1(buffer_t *buf, int size) buf->mrec = size; } } -void maux_reset(maux_t *ma) +void maux_reset(maux_t *ma, int *rid_tab) { int i,j; for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); @@ -848,7 +848,10 @@ void maux_reset(maux_t *ma) for (i=0; in; i++) { bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); - ma->buf[i].rid = bcf_hdr_name2id(hdr,chr); + if (new_chr) + rid_tab[i] = bcf_hdr_name2id(hdr,chr); + + ma->buf[i].rid = rid_tab[i]; ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 
0 : 1; for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) { @@ -1073,8 +1076,8 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t /* * copy_string_field() - copy a comma-separated field * @param src: source string - * @param isrc: index of the field to copy - * @param src_len: length of source string (excluding the terminating \0) + * @param isrc: index of the field to copy + * @param src_len: length of source string (excluding the terminating \0) * @param dst: destination kstring (must be initialized with missing values, e.g. as ".") * @param idst: index of the destination field */ @@ -1269,7 +1272,12 @@ void merge_info(args_t *args, bcf1_t *out) bcf_info_t *inf = &line->d.info[j]; const char *key = hdr->id[BCF_DT_ID][inf->key].key; - if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done + // AC and AN are done in merge_format() after genotypes are done + if (!args->keep_AC_AN && + (key[0] == 'A' + && (key[1] == 'C' || key[1] == 'N') + && key[2] == 0)) + continue; int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key); if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key); @@ -1328,7 +1336,7 @@ void merge_info(args_t *args, bcf1_t *out) out->d.info[out->n_info].vptr_off = inf->vptr_off; out->d.info[out->n_info].vptr_len = inf->vptr_len; out->d.info[out->n_info].vptr_free = 1; - out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); + out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off); out->d.info[out->n_info].vptr += inf->vptr_off; if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) ) @@ -1429,7 +1437,7 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL) if ( line->n_allele <= args->local_alleles + 1 ) { - // sort to the 
output order, insertion sort, ascending + // sort to the output order, insertion sort, ascending int *map = ma->buf[i].rec[ma->buf[i].cur].map; int *k2k = ma->k2k; int tmp; @@ -1740,7 +1748,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf int iori,inew; for (iori=ifrom; iorin_allele; iori++) { - inew = ma->buf[i].rec[irec].map[iori] - ifrom; + inew = ma->buf[i].rec[irec].map[iori] - ifrom; int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); } @@ -2304,7 +2312,7 @@ void gvcf_set_alleles(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; - for (i=0; inals; i++) + for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; @@ -2367,11 +2375,11 @@ void gvcf_write_block(args_t *args, int start, int end) for (i=0; ifiles->nreaders; i++) { if ( !gaux[i].active ) continue; - if ( gaux[i].end < start ) - { - gaux[i].active = 0; + if ( gaux[i].end < start ) + { + gaux[i].active = 0; maux->buf[i].cur = -1; - continue; + continue; } gaux[i].line->d.allele[0][0] = ref; if ( min > gaux[i].end ) min = gaux[i].end; @@ -2424,9 +2432,9 @@ void gvcf_write_block(args_t *args, int start, int end) if ( !gaux[i].active ) continue; if ( gaux[i].end < end ) { - gaux[i].active = 0; + gaux[i].active = 0; maux->buf[i].cur = -1; - continue; + continue; } // next min END position bigger than the current one if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1; @@ -2449,7 +2457,7 @@ void gvcf_write_block(args_t *args, int start, int end) 3 END=5 A B C 6 END=7 A B . 8 END=10 A . . 
- + */ void gvcf_flush(args_t *args, int done) { @@ -2583,7 +2591,7 @@ void gvcf_stage(args_t *args, int pos) if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; } else - maux->gvcf_break = line->pos; // must break the gvcf block + maux->gvcf_break = line->pos; // must break the gvcf block } maux->ntmp_arr = nend * sizeof(int32_t); maux->tmp_arr = end; @@ -2704,7 +2712,7 @@ int can_merge(args_t *args) char *id = NULL, ref = 'N'; int i,j,k, ntodo = 0; - for (i=0; inals; i++) + for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; @@ -3062,13 +3070,17 @@ void merge_vcf(args_t *args) args->out_line = bcf_init1(); args->tmph = kh_init(strdict); + int *rid_tab = calloc(args->maux->n, sizeof(*rid_tab)); + if (!rid_tab) + error("[%s:%d] Could not allocate %zu bytes\n", __FILE__, __LINE__, args->maux->n*sizeof(*rid_tab)); + while ( bcf_sr_next_line(args->files) ) { // output cached gVCF blocks which end before the new record if ( args->do_gvcf ) gvcf_flush(args,0); - maux_reset(args->maux); + maux_reset(args->maux, rid_tab); // determine which of the new records are gvcf blocks if ( args->do_gvcf ) @@ -3082,6 +3094,7 @@ void merge_vcf(args_t *args) clean_buffer(args); // debug_state(args); } + free(rid_tab); if ( args->do_gvcf ) gvcf_flush(args,1); @@ -3175,7 +3188,7 @@ int main_vcfmerge(int argc, char *argv[]) if ( args->local_alleles < 1 ) error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg); break; - case 'F': + case 'F': if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; else error("Filter logic not recognised: %s\n", optarg); @@ -3228,10 +3241,8 @@ int main_vcfmerge(int argc, char *argv[]) case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; case 4 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( 
!strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 8a140fb..38c5de4 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -142,6 +142,7 @@ static void seq_to_upper(char *seq, int len) static void fix_ref(args_t *args, bcf1_t *line) { + bcf_unpack(line, BCF_UN_STR); int reflen = strlen(line->d.allele[0]); int i,j, maxlen = reflen, len; for (i=1; in_allele; i++) @@ -160,10 +161,10 @@ static void fix_ref(args_t *args, bcf1_t *line) if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } // is the REF allele missing? - if ( reflen==1 && line->d.allele[0][0]=='.' ) - { - line->d.allele[0][0] = ref[0]; - args->nref.set++; + if ( reflen==1 && line->d.allele[0][0]=='.' 
) + { + line->d.allele[0][0] = ref[0]; + args->nref.set++; free(ref); bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); return; @@ -235,7 +236,7 @@ static void fix_ref(args_t *args, bcf1_t *line) for (j=1; jn_allele; j++) { kputc(',',&str); - if ( j==i ) + if ( j==i ) kputs(line->d.allele[0],&str); else kputs(line->d.allele[j],&str); @@ -1774,7 +1775,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec) } khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash; - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break; if ( jn_allele ) continue; return 1; @@ -1863,7 +1864,7 @@ static void init_data(args_t *args) args->out_hdr = bcf_hdr_dup(args->hdr); if ( args->old_rec_tag ) - bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); + bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); rbuf_init(&args->rbuf, 100); args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); @@ -1880,7 +1881,7 @@ static void init_data(args_t *args) } if ( args->atomize==SPLIT ) { - args->abuf = abuf_init(args->hdr, SPLIT); + args->abuf = abuf_init(args->hdr, SPLIT); abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr); if ( args->old_rec_tag ) abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag); @@ -2226,7 +2227,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); + fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; @@ -2243,16 +2244,12 @@ int main_vcfnorm(int argc, char *argv[]) case 8 : args->record_cmd_line = 0; break; case 7 : args->force = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) 
regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(); break; diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index aa21490..a292b96 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -144,6 +144,7 @@ static void seq_to_upper(char *seq, int len) static void fix_ref(args_t *args, bcf1_t *line) { + bcf_unpack(line, BCF_UN_STR); int reflen = strlen(line->d.allele[0]); int i,j, maxlen = reflen, len; for (i=1; in_allele; i++) @@ -162,10 +163,10 @@ static void fix_ref(args_t *args, bcf1_t *line) if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } // is the REF allele missing? - if ( reflen==1 && line->d.allele[0][0]=='.' ) - { - line->d.allele[0][0] = ref[0]; - args->nref.set++; + if ( reflen==1 && line->d.allele[0][0]=='.' 
) + { + line->d.allele[0][0] = ref[0]; + args->nref.set++; free(ref); bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); return; @@ -237,7 +238,7 @@ static void fix_ref(args_t *args, bcf1_t *line) for (j=1; jn_allele; j++) { kputc(',',&str); - if ( j==i ) + if ( j==i ) kputs(line->d.allele[0],&str); else kputs(line->d.allele[j],&str); @@ -1776,7 +1777,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec) } khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash; - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break; if ( jn_allele ) continue; return 1; @@ -1865,7 +1866,7 @@ static void init_data(args_t *args) args->out_hdr = bcf_hdr_dup(args->hdr); if ( args->old_rec_tag ) - bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); + bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); rbuf_init(&args->rbuf, 100); args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); @@ -1882,7 +1883,7 @@ static void init_data(args_t *args) } if ( args->atomize==SPLIT ) { - args->abuf = abuf_init(args->hdr, SPLIT); + args->abuf = abuf_init(args->hdr, SPLIT); abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr); if ( args->old_rec_tag ) abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag); @@ -2228,7 +2229,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); + fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; @@ -2245,16 +2246,12 @@ int main_vcfnorm(int argc, char *argv[]) case 8 : args->record_cmd_line = 0; break; case 7 : args->force = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( 
!strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(); break; diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index 7656192..4568668 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -455,11 +455,22 @@ static int cmp_plugin_name(const void *p1, const void *p2) return strcmp(a->name,b->name); } +// If args=NULL then returns the number of plugins available. Otherwise prints the +// plugins on stdout and returns 0 on success. static int list_plugins(args_t *args) { plugin_t *plugins = NULL; int nplugins = 0, mplugins = 0; + int count_only = 0; + args_t _args; + if ( !args ) + { + memset(&_args,0,sizeof(_args)); + args = &_args; + args->nplugin_paths = -1; + count_only = 1; + } init_plugin_paths(args); kstring_t str = {0,0,0}; @@ -490,6 +501,11 @@ static int list_plugins(args_t *args) } closedir(dp); } + if ( count_only ) + { + free(str.s); + return nplugins; + } if ( nplugins ) { qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name); @@ -508,6 +524,10 @@ static int list_plugins(args_t *args) free(str.s); return nplugins ? 
0 : 1; } +int count_plugins(void) +{ + return list_plugins(NULL); +} static void init_data(args_t *args) { @@ -694,16 +714,12 @@ int main_plugin(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': args->plist_only = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index cb577d5..b37ac23 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -457,11 +457,22 @@ static int cmp_plugin_name(const void *p1, const void *p2) return strcmp(a->name,b->name); } +// If args=NULL then returns the number of plugins available. Otherwise prints the +// plugins on bcftools_stdout and returns 0 on success. 
static int list_plugins(args_t *args) { plugin_t *plugins = NULL; int nplugins = 0, mplugins = 0; + int count_only = 0; + args_t _args; + if ( !args ) + { + memset(&_args,0,sizeof(_args)); + args = &_args; + args->nplugin_paths = -1; + count_only = 1; + } init_plugin_paths(args); kstring_t str = {0,0,0}; @@ -492,6 +503,11 @@ static int list_plugins(args_t *args) } closedir(dp); } + if ( count_only ) + { + free(str.s); + return nplugins; + } if ( nplugins ) { qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name); @@ -510,6 +526,10 @@ static int list_plugins(args_t *args) free(str.s); return nplugins ? 0 : 1; } +int count_plugins(void) +{ + return list_plugins(NULL); +} static void init_data(args_t *args) { @@ -696,16 +716,12 @@ int main_plugin(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': args->plist_only = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 882c3bb..70b5f30 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. 
- Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -37,6 +37,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "convert.h" +#include "smpl_ilist.h" // Logic of the filters: include or exclude sites which match the filters? @@ -52,9 +53,9 @@ typedef struct convert_t *convert; bcf_srs_t *files; bcf_hdr_t *header; - int nsamples, *samples, sample_is_file; + int sample_is_file; char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out; - int argc, list_columns, print_header, allow_undef_tags; + int argc, list_columns, print_header, allow_undef_tags, force_samples; FILE *out; } args_t; @@ -76,28 +77,21 @@ static void init_data(args_t *args) { for (i=0; ifiles->nreaders; i++) { + // This tells htslib to subset samples directly when reading. Also the header is modified to + // include only the requested samples int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file); if ( ret<0 ) error("Error parsing the sample list\n"); - else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret); + else if ( ret>0 && !args->force_samples ) + error("Error: sample #%d not found in the header, user --force-samples to proceed anyway\n", ret); } - if ( args->sample_list[0]!='^' ) - { - // the sample ordering may be different if not negated - int n; - char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); - if ( !smpls ) error("Could not parse %s\n", args->sample_list); - if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) ) - error("The number of samples does not match, perhaps some are present multiple times?\n"); - nsamples = bcf_hdr_nsamples(args->files->readers[0].header); - samples = (int*) malloc(sizeof(int)*nsamples); - for (i=0; ifiles->readers[0].header, BCF_DT_SAMPLE,smpls[i]); - free(smpls[i]); - } - free(smpls); - } + int flags = SMPL_REORDER; + smpl_ilist_t 
*ilist = smpl_ilist_init(args->files->readers[0].header, args->sample_list, args->sample_is_file, flags); + nsamples = ilist->n; + samples = (int*) malloc(sizeof(int)*nsamples); + for (i=0; in; i++) + samples[i] = ilist->idx[i]; + smpl_ilist_destroy(ilist); } args->convert = convert_init(args->header, samples, nsamples, args->format_str); convert_set_option(args->convert, subset_samples, &args->smpl_pass); @@ -118,7 +112,6 @@ static void destroy_data(args_t *args) convert_destroy(args->convert); if ( args->filter ) filter_destroy(args->filter); - free(args->samples); } static void query_vcf(args_t *args) @@ -175,21 +168,35 @@ static void query_vcf(args_t *args) static void list_columns(args_t *args) { + int negate = 0; + int i; + bcf_sr_t *reader = &args->files->readers[0]; void *has_sample = NULL; if ( args->sample_list ) { + if ( args->sample_list[0]=='^' ) negate = 1; has_sample = khash_str2int_init(); int i, nsmpl; - char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); - for (i=0; isample_list+1 : args->sample_list, args->sample_is_file, &nsmpl); + if ( !smpl ) error("Error: failed to read %s\n", negate ? 
args->sample_list+1 : args->sample_list); + for (i=0; iheader,BCF_DT_SAMPLE,smpl[i])<0 && !args->force_samples ) + error("Error: sample #%d not found in the header, user --force-samples to proceed anyway\n", i+1); + khash_str2int_inc(has_sample, smpl[i]); + } free(smpl); } - int i; - bcf_sr_t *reader = &args->files->readers[0]; for (i=0; iheader); i++) { - if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue; + int skip = 0; + if ( negate ) + { + if ( khash_str2int_has_key(has_sample, reader->header->samples[i]) ) skip = 1; + } + else if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) skip = 1; + if ( skip ) continue; printf("%s\n", reader->header->samples[i]); } @@ -222,6 +229,7 @@ static void usage(void) fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(stderr, " -f, --format STRING See man page for details\n"); fprintf(stderr, " -H, --print-header Print header\n"); fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); @@ -260,6 +268,7 @@ int main_vcfquery(int argc, char *argv[]) {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"format",1,0,'f'}, + {"force-samples",0,0,3}, {"output-file",1,0,'o'}, {"output",1,0,'o'}, {"regions",1,0,'r'}, @@ -318,17 +327,14 @@ int main_vcfquery(int argc, char *argv[]) case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) 
error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 3 : args->force_samples = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -397,6 +403,7 @@ int main_vcfquery(int argc, char *argv[]) int i, k, nfiles, prev_nsamples = 0; char **fnames, **prev_samples = NULL; fnames = hts_readlist(args->vcf_list, 1, &nfiles); + if ( !fnames ) error("Error: failed to read %s\n", args->vcf_list); if ( !nfiles ) error("No files in %s?\n", args->vcf_list); for (i=0; i @@ -39,6 +39,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "convert.h" +#include "smpl_ilist.h" // Logic of the filters: include or exclude sites which match the filters? @@ -54,9 +55,9 @@ typedef struct convert_t *convert; bcf_srs_t *files; bcf_hdr_t *header; - int nsamples, *samples, sample_is_file; + int sample_is_file; char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out; - int argc, list_columns, print_header, allow_undef_tags; + int argc, list_columns, print_header, allow_undef_tags, force_samples; FILE *out; } args_t; @@ -78,28 +79,21 @@ static void init_data(args_t *args) { for (i=0; ifiles->nreaders; i++) { + // This tells htslib to subset samples directly when reading. 
Also the header is modified to + // include only the requested samples int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file); if ( ret<0 ) error("Error parsing the sample list\n"); - else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret); + else if ( ret>0 && !args->force_samples ) + error("Error: sample #%d not found in the header, user --force-samples to proceed anyway\n", ret); } - if ( args->sample_list[0]!='^' ) - { - // the sample ordering may be different if not negated - int n; - char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); - if ( !smpls ) error("Could not parse %s\n", args->sample_list); - if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) ) - error("The number of samples does not match, perhaps some are present multiple times?\n"); - nsamples = bcf_hdr_nsamples(args->files->readers[0].header); - samples = (int*) malloc(sizeof(int)*nsamples); - for (i=0; ifiles->readers[0].header, BCF_DT_SAMPLE,smpls[i]); - free(smpls[i]); - } - free(smpls); - } + int flags = SMPL_REORDER; + smpl_ilist_t *ilist = smpl_ilist_init(args->files->readers[0].header, args->sample_list, args->sample_is_file, flags); + nsamples = ilist->n; + samples = (int*) malloc(sizeof(int)*nsamples); + for (i=0; in; i++) + samples[i] = ilist->idx[i]; + smpl_ilist_destroy(ilist); } args->convert = convert_init(args->header, samples, nsamples, args->format_str); convert_set_option(args->convert, subset_samples, &args->smpl_pass); @@ -120,7 +114,6 @@ static void destroy_data(args_t *args) convert_destroy(args->convert); if ( args->filter ) filter_destroy(args->filter); - free(args->samples); } static void query_vcf(args_t *args) @@ -177,21 +170,35 @@ static void query_vcf(args_t *args) static void list_columns(args_t *args) { + int negate = 0; + int i; + bcf_sr_t *reader = &args->files->readers[0]; void *has_sample = NULL; if ( args->sample_list ) { + if ( args->sample_list[0]=='^' 
) negate = 1; has_sample = khash_str2int_init(); int i, nsmpl; - char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); - for (i=0; isample_list+1 : args->sample_list, args->sample_is_file, &nsmpl); + if ( !smpl ) error("Error: failed to read %s\n", negate ? args->sample_list+1 : args->sample_list); + for (i=0; iheader,BCF_DT_SAMPLE,smpl[i])<0 && !args->force_samples ) + error("Error: sample #%d not found in the header, user --force-samples to proceed anyway\n", i+1); + khash_str2int_inc(has_sample, smpl[i]); + } free(smpl); } - int i; - bcf_sr_t *reader = &args->files->readers[0]; for (i=0; iheader); i++) { - if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue; + int skip = 0; + if ( negate ) + { + if ( khash_str2int_has_key(has_sample, reader->header->samples[i]) ) skip = 1; + } + else if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) skip = 1; + if ( skip ) continue; fprintf(bcftools_stdout, "%s\n", reader->header->samples[i]); } @@ -224,6 +231,7 @@ static void usage(void) fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(bcftools_stderr, " -f, --format STRING See man page for details\n"); fprintf(bcftools_stderr, " -H, --print-header Print header\n"); fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); @@ -262,6 +270,7 @@ int main_vcfquery(int argc, char *argv[]) {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"format",1,0,'f'}, + {"force-samples",0,0,3}, {"output-file",1,0,'o'}, {"output",1,0,'o'}, {"regions",1,0,'r'}, @@ -320,17 +329,14 @@ int main_vcfquery(int argc, char *argv[]) case 's': args->sample_list = optarg; break; case 'S': args->sample_list = 
optarg; args->sample_is_file = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 3 : args->force_samples = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -399,6 +405,7 @@ int main_vcfquery(int argc, char *argv[]) int i, k, nfiles, prev_nsamples = 0; char **fnames, **prev_samples = NULL; fnames = hts_readlist(args->vcf_list, 1, &nfiles); + if ( !fnames ) error("Error: failed to read %s\n", args->vcf_list); if ( !nfiles ) error("No files in %s?\n", args->vcf_list); for (i=0; i @@ -153,7 +153,7 @@ static void init_data(args_t *args) args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n"); - if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) + if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) error("Error: The FORMAT/PL tag not defined as Integer in the header\n"); } @@ -279,15 +279,15 @@ static void init_data(args_t *args) MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW; MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ; - 
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; args->hmm = hmm_init(2, tprob, 10000); - if ( args->genmap_fname ) + if ( args->genmap_fname ) hmm_set_tprob_func(args->hmm, set_tprob_genmap, args); else if ( args->rec_rate > 0 ) hmm_set_tprob_func(args->hmm, set_tprob_rrate, args); - args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); + args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno)); // print header @@ -509,7 +509,7 @@ static void flush_viterbi(args_t *args, int ismpl) if ( !args->vi_training ) // single viterbi pass { - hmm_restore(args->hmm, smpl->snapshot); + hmm_restore(args->hmm, smpl->snapshot); int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites; if ( end < smpl->nsites ) smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->sites[smpl->nsites - args->nbuf_olap - 1]); @@ -535,7 +535,7 @@ static void flush_viterbi(args_t *args, int ismpl) if ( args->output_type & OUTPUT_RG ) { - if ( state!=smpl->rg.state ) + if ( state!=smpl->rg.state ) { if ( !state ) // the region ends, flush { @@ -599,7 +599,7 @@ static void flush_viterbi(args_t *args, int ismpl) MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW; MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ; - MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; hmm_set_tprob(args->hmm, tprob_arr, 10000); int niter = 0; @@ -627,14 +627,14 @@ static void flush_viterbi(args_t *args, int ismpl) delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev); niter++; args->str.l = 0; - ksprintf(&args->str, 
"VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", + ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", name,niter,deltaz,delthw, 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW), 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ)); if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th ); - + // output the results for (i=0; inrid; i++) { @@ -658,10 +658,10 @@ static void flush_viterbi(args_t *args, int ismpl) } } -int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) +int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { if ( tgt->nals < 2 ) - error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s); + error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", tgt->fname,tgt->line.s); if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match int i; @@ -671,7 +671,7 @@ int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) char *tmp, *str = tgt->line.s; i = 0; - while ( *str && i<3 ) + while ( *str && i<3 ) { if ( *str=='\t' ) i++; str++; @@ -722,7 +722,11 @@ int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq) int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr); while ( gt < end ) { - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) + { + gt += 2; + continue; + } if ( bcf_gt_allele(gt[0]) ) nalt++; else nref++; @@ -746,7 +750,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields - + 
if ( args->af_smpl ) // subset samples for AF estimate { #define BRANCH(type_t) \ @@ -838,10 +842,10 @@ int process_line(args_t *args, bcf1_t *line, int ial) if ( ret==-2 ) error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); } - else if ( args->af_fname ) + else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args, args->files->targets, line, &alt_freq); + ret = read_AF(args->files->targets, line, &alt_freq); } else if ( args->dflt_AF > 0 ) { @@ -875,9 +879,9 @@ int process_line(args_t *args, bcf1_t *line, int ial) if ( ret>0 ) AC = args->itmp[0]; } - if ( AN<=0 || AC<0 ) + if ( AN<=0 || AC<0 ) ret = -1; - else + else alt_freq = (double) AC/AN; } @@ -962,12 +966,12 @@ int process_line(args_t *args, bcf1_t *line, int ial) smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2); if ( !smpl->eprob ) error("Error: failed to alloc %"PRIu64" bytes\n", (uint64_t)(sizeof(*smpl->eprob)*smpl->msites*2)); } - + // Calculate emission probabilities P(D|AZ) and P(D|HW) double *eprob = &smpl->eprob[2*smpl->nsites]; eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; - + smpl->sites[smpl->nsites] = line->pos; smpl->nsites++; @@ -994,12 +998,12 @@ static void vcfroh(args_t *args, bcf1_t *line) // Are we done? 
if ( !line ) - { + { for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); - return; + return; } - // Skip unwanted lines, for simplicity we consider only biallelic sites + // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; // This can be raw callable VCF with the symbolic unseen allele <*> @@ -1043,7 +1047,7 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_pos = line->pos; skip_rid = load_genmap(args, bcf_seqname(args->hdr,line)); } - else if ( args->prev_pos == line->pos ) + else if ( args->prev_pos == line->pos ) { args->ndup++; return; // skip duplicate positions @@ -1161,7 +1165,7 @@ int main_vcfroh(int argc, char *argv[]) switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; - case 2: + case 2: args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; @@ -1173,7 +1177,7 @@ int main_vcfroh(int argc, char *argv[]) args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 5: args->include_noalt_sites = 1; break; case 'o': args->output_fname = optarg; break; - case 'O': + case 'O': if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG; if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ; @@ -1183,10 +1187,10 @@ int main_vcfroh(int argc, char *argv[]) case 'i': args->skip_homref = 1; break; case 'I': args->snps_only = 1; break; case 'G': - args->fake_PLs = 1; + args->fake_PLs = 1; args->unseen_PL = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -G %s\n", optarg); - args->unseen_PL = pow(10,-args->unseen_PL/10.); + args->unseen_PL = pow(10,-args->unseen_PL/10.); break; case 'm': args->genmap_fname = optarg; break; case 'M': @@ -1208,24 +1212,20 @@ int main_vcfroh(int argc, char *argv[]) case 'r': args->regions_list = optarg; 
break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 6 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 7 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 'V': - args->vi_training = 1; - args->baum_welch_th = strtod(optarg,&tmp); + case 'V': + args->vi_training = 1; + args->baum_welch_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg); break; - case 'h': + case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index 1546461..f9b8aab 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -2,7 +2,7 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -155,7 +155,7 @@ static void init_data(args_t *args) args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n"); - if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) + if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) error("Error: The FORMAT/PL tag not defined as Integer in the header\n"); } @@ -281,15 +281,15 @@ static void init_data(args_t *args) MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW; MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ; - MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; args->hmm = hmm_init(2, tprob, 10000); - if ( args->genmap_fname ) + if ( args->genmap_fname ) hmm_set_tprob_func(args->hmm, set_tprob_genmap, args); else if ( args->rec_rate > 0 ) hmm_set_tprob_func(args->hmm, set_tprob_rrate, args); - args->out = bgzf_open(strcmp("bcftools_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); + args->out = bgzf_open(strcmp("bcftools_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno)); // print header @@ -511,7 +511,7 @@ static void flush_viterbi(args_t *args, int ismpl) if ( !args->vi_training ) // single viterbi pass { - hmm_restore(args->hmm, smpl->snapshot); + hmm_restore(args->hmm, smpl->snapshot); int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? 
smpl->nsites - args->nbuf_olap : smpl->nsites; if ( end < smpl->nsites ) smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->sites[smpl->nsites - args->nbuf_olap - 1]); @@ -537,7 +537,7 @@ static void flush_viterbi(args_t *args, int ismpl) if ( args->output_type & OUTPUT_RG ) { - if ( state!=smpl->rg.state ) + if ( state!=smpl->rg.state ) { if ( !state ) // the region ends, flush { @@ -601,7 +601,7 @@ static void flush_viterbi(args_t *args, int ismpl) MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW; MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ; - MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; hmm_set_tprob(args->hmm, tprob_arr, 10000); int niter = 0; @@ -629,14 +629,14 @@ static void flush_viterbi(args_t *args, int ismpl) delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev); niter++; args->str.l = 0; - ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", + ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", name,niter,deltaz,delthw, 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW), 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ)); if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th ); - + // output the results for (i=0; inrid; i++) { @@ -660,10 +660,10 @@ static void flush_viterbi(args_t *args, int ismpl) } } -int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) +int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { if ( tgt->nals < 2 ) - error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s); + error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", 
tgt->fname,tgt->line.s); if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match int i; @@ -673,7 +673,7 @@ int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) char *tmp, *str = tgt->line.s; i = 0; - while ( *str && i<3 ) + while ( *str && i<3 ) { if ( *str=='\t' ) i++; str++; @@ -724,7 +724,11 @@ int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq) int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr); while ( gt < end ) { - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) + { + gt += 2; + continue; + } if ( bcf_gt_allele(gt[0]) ) nalt++; else nref++; @@ -748,7 +752,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields - + if ( args->af_smpl ) // subset samples for AF estimate { #define BRANCH(type_t) \ @@ -840,10 +844,10 @@ int process_line(args_t *args, bcf1_t *line, int ial) if ( ret==-2 ) error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); } - else if ( args->af_fname ) + else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args, args->files->targets, line, &alt_freq); + ret = read_AF(args->files->targets, line, &alt_freq); } else if ( args->dflt_AF > 0 ) { @@ -877,9 +881,9 @@ int process_line(args_t *args, bcf1_t *line, int ial) if ( ret>0 ) AC = args->itmp[0]; } - if ( AN<=0 || AC<0 ) + if ( AN<=0 || AC<0 ) ret = -1; - else + else alt_freq = (double) AC/AN; } @@ -964,12 +968,12 @@ int process_line(args_t *args, bcf1_t *line, int ial) smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2); if ( !smpl->eprob ) error("Error: failed to alloc %"PRIu64" bytes\n", (uint64_t)(sizeof(*smpl->eprob)*smpl->msites*2)); 
} - + // Calculate emission probabilities P(D|AZ) and P(D|HW) double *eprob = &smpl->eprob[2*smpl->nsites]; eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; - + smpl->sites[smpl->nsites] = line->pos; smpl->nsites++; @@ -996,12 +1000,12 @@ static void vcfroh(args_t *args, bcf1_t *line) // Are we done? if ( !line ) - { + { for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); - return; + return; } - // Skip unwanted lines, for simplicity we consider only biallelic sites + // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; // This can be raw callable VCF with the symbolic unseen allele <*> @@ -1045,7 +1049,7 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_pos = line->pos; skip_rid = load_genmap(args, bcf_seqname(args->hdr,line)); } - else if ( args->prev_pos == line->pos ) + else if ( args->prev_pos == line->pos ) { args->ndup++; return; // skip duplicate positions @@ -1163,7 +1167,7 @@ int main_vcfroh(int argc, char *argv[]) switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; - case 2: + case 2: args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; @@ -1175,7 +1179,7 @@ int main_vcfroh(int argc, char *argv[]) args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 5: args->include_noalt_sites = 1; break; case 'o': args->output_fname = optarg; break; - case 'O': + case 'O': if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG; if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ; @@ -1185,10 +1189,10 @@ int main_vcfroh(int argc, char *argv[]) case 'i': args->skip_homref = 1; break; case 'I': args->snps_only = 1; 
break; case 'G': - args->fake_PLs = 1; + args->fake_PLs = 1; args->unseen_PL = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -G %s\n", optarg); - args->unseen_PL = pow(10,-args->unseen_PL/10.); + args->unseen_PL = pow(10,-args->unseen_PL/10.); break; case 'm': args->genmap_fname = optarg; break; case 'M': @@ -1210,24 +1214,20 @@ int main_vcfroh(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 6 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 7 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 'V': - args->vi_training = 1; - args->baum_welch_th = strtod(optarg,&tmp); + case 'V': + args->vi_training = 1; + args->baum_welch_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg); break; - case 'h': + case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index c13b3e3..b286c90 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -72,7 +72,7 @@ idist_t; typedef struct { - uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; + uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, 
n_snp_mals, n_records, n_noalts; int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons #if HWE_STATS int *af_hwe; @@ -107,7 +107,7 @@ typedef struct { uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats /* - Pearson's R^2 is used for aggregate R^2 + Pearson's R^2 is used for aggregate R^2 y, yy .. sum of dosage and squared dosage in the query VCF (second file) x, xx .. sum of squared dosage in the truth VCF (first file) n .. number of genotypes @@ -436,7 +436,7 @@ static void init_stats(args_t *args) else { args->af_bins = bin_init(args->af_bins_list,0,1); - + // m_af is used also for other af arrays, where the first bin is for // singletons. However, since the last element is unused in af_bins // (n boundaries form n-1 intervals), the m_af count is good for both. @@ -892,7 +892,7 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; - stats->dvaf[bin] += dvaf; + stats->dvaf[bin] += dvaf; } static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) @@ -1199,7 +1199,7 @@ static void do_vcf_stats(args_t *args) do_sample_stats(args, stats, reader, ret); if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 ) - (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; + (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; } } @@ -1270,14 +1270,14 @@ static void print_stats(args_t *args) for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - printf("SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); - printf("SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); - printf("SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); - printf("SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); - printf("SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); - printf("SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); - 
printf("SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); - printf("SN\t%d\tnumber of multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); + printf("SN\t%d\tnumber of records:\t%"PRIu64"\n", id, stats->n_records); + printf("SN\t%d\tnumber of no-ALTs:\t%"PRIu64"\n", id, stats->n_noalts); + printf("SN\t%d\tnumber of SNPs:\t%"PRIu64"\n", id, stats->n_snps); + printf("SN\t%d\tnumber of MNPs:\t%"PRIu64"\n", id, stats->n_mnps); + printf("SN\t%d\tnumber of indels:\t%"PRIu64"\n", id, stats->n_indels); + printf("SN\t%d\tnumber of others:\t%"PRIu64"\n", id, stats->n_others); + printf("SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals); + printf("SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals); } printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) @@ -1419,7 +1419,7 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n"; + const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n"; printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } @@ -1489,7 +1489,7 @@ static void print_stats(args_t *args) for (k=0; k<4; k++) { n += stats[i].gt2gt[j][k]; - if ( j==k ) + if ( j==k ) { nrd_m[j] += stats[i].gt2gt[j][k]; m[j] += stats[i].gt2gt[j][k]; @@ -1512,8 +1512,8 @@ static void print_stats(args_t *args) } double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); printf("GC%cAF\t2\t%f", x==0 ? 
's' : 'i', af); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); if ( stats[i].n && !isnan(r2) ) printf("\t%f", r2); else printf("\t"NA_STRING); printf("\t%.0f\n", stats[i].n); @@ -1571,11 +1571,11 @@ static void print_stats(args_t *args) r2 *= r2; } printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); @@ -1631,7 +1631,7 @@ static void print_stats(args_t *args) printf("GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); for (j=0; j<5; j++) for (k=0; k<5; k++) - printf("\t%"PRId64, stats[i].gt2gt[j][k]); + printf("\t%"PRIu64, stats[i].gt2gt[j][k]); printf("\n"); } } @@ -1650,8 +1650,8 @@ static void print_stats(args_t *args) if ( i==0 ) printf("<%d", stats->dp.min); else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max); else printf("%d", idist_i2bin(&stats->dp,i)); - printf("\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); - printf("\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? 
stats->dp_sites.vals[i]*100./sum_sites : 0); + printf("\t%"PRIu64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); + printf("\t%"PRIu64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); } } @@ -1851,16 +1851,12 @@ int main_vcfstats(int argc, char *argv[]) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 4f6c898..ebde82e 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -74,7 +74,7 @@ idist_t; typedef struct { - uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; + uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons #if HWE_STATS int *af_hwe; @@ -109,7 +109,7 @@ typedef struct { uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. 
matches/mismatches; see type2stats /* - Pearson's R^2 is used for aggregate R^2 + Pearson's R^2 is used for aggregate R^2 y, yy .. sum of dosage and squared dosage in the query VCF (second file) x, xx .. sum of squared dosage in the truth VCF (first file) n .. number of genotypes @@ -438,7 +438,7 @@ static void init_stats(args_t *args) else { args->af_bins = bin_init(args->af_bins_list,0,1); - + // m_af is used also for other af arrays, where the first bin is for // singletons. However, since the last element is unused in af_bins // (n boundaries form n-1 intervals), the m_af count is good for both. @@ -894,7 +894,7 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; - stats->dvaf[bin] += dvaf; + stats->dvaf[bin] += dvaf; } static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) @@ -1201,7 +1201,7 @@ static void do_vcf_stats(args_t *args) do_sample_stats(args, stats, reader, ret); if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 ) - (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; + (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; } } @@ -1272,14 +1272,14 @@ static void print_stats(args_t *args) for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); - fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); - fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); - fprintf(bcftools_stdout, "SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); - fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); - fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); - fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); - 
fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); + fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%"PRIu64"\n", id, stats->n_records); + fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%"PRIu64"\n", id, stats->n_noalts); + fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%"PRIu64"\n", id, stats->n_snps); + fprintf(bcftools_stdout, "SN\t%d\tnumber of MNPs:\t%"PRIu64"\n", id, stats->n_mnps); + fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%"PRIu64"\n", id, stats->n_indels); + fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%"PRIu64"\n", id, stats->n_others); + fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals); + fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals); } fprintf(bcftools_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) @@ -1421,7 +1421,7 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n"; + const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n"; fprintf(bcftools_stdout, fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } @@ -1491,7 +1491,7 @@ static void print_stats(args_t *args) for (k=0; k<4; k++) { n += stats[i].gt2gt[j][k]; - if ( j==k ) + if ( j==k ) { nrd_m[j] += stats[i].gt2gt[j][k]; m[j] += stats[i].gt2gt[j][k]; @@ -1514,8 +1514,8 @@ static void print_stats(args_t *args) } double af = args->af_bins ? 
(bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); fprintf(bcftools_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', af); - fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); - fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); + fprintf(bcftools_stdout, "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); + fprintf(bcftools_stdout, "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); if ( stats[i].n && !isnan(r2) ) fprintf(bcftools_stdout, "\t%f", r2); else fprintf(bcftools_stdout, "\t"NA_STRING); fprintf(bcftools_stdout, "\t%.0f\n", stats[i].n); @@ -1573,11 +1573,11 @@ static void print_stats(args_t *args) r2 *= r2; } fprintf(bcftools_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); - fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", + fprintf(bcftools_stdout, "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); - fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", + fprintf(bcftools_stdout, "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); @@ -1633,7 +1633,7 @@ static void print_stats(args_t *args) fprintf(bcftools_stdout, "GCT%c\t%s", x==0 ? 
's' : 'i', args->files->samples[i]); for (j=0; j<5; j++) for (k=0; k<5; k++) - fprintf(bcftools_stdout, "\t%"PRId64, stats[i].gt2gt[j][k]); + fprintf(bcftools_stdout, "\t%"PRIu64, stats[i].gt2gt[j][k]); fprintf(bcftools_stdout, "\n"); } } @@ -1652,8 +1652,8 @@ static void print_stats(args_t *args) if ( i==0 ) fprintf(bcftools_stdout, "<%d", stats->dp.min); else if ( i+1==stats->dp.m_vals ) fprintf(bcftools_stdout, ">%d", stats->dp.max); else fprintf(bcftools_stdout, "%d", idist_i2bin(&stats->dp,i)); - fprintf(bcftools_stdout, "\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); - fprintf(bcftools_stdout, "\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); + fprintf(bcftools_stdout, "\t%"PRIu64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); + fprintf(bcftools_stdout, "\t%"PRIu64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); } } @@ -1853,16 +1853,12 @@ int main_vcfstats(int argc, char *argv[]) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); 
break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index 1dbcc61..cc02058 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -716,16 +716,12 @@ int main_vcfview(int argc, char *argv[]) break; } case 2 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 9767124..4bbbefb 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -718,16 +718,12 @@ int main_vcfview(int argc, char *argv[]) break; } case 2 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 
1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/bcftools/version.c b/bcftools/version.c index 73e0b04..4306d40 100644 --- a/bcftools/version.c +++ b/bcftools/version.c @@ -69,7 +69,7 @@ const char *hts_bcf_wmode(int file_type) return "w"; // uncompressed VCF } -const char *hts_bcf_wmode2(int file_type, char *fname) +const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); int len = strlen(fname); @@ -80,7 +80,7 @@ const char *hts_bcf_wmode2(int file_type, char *fname) return hts_bcf_wmode(file_type); } -void set_wmode(char dst[8], int file_type, char *fname, int clevel) +void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; int len = fname ? 
strlen(fname) : 0; @@ -100,3 +100,10 @@ void set_wmode(char dst[8], int file_type, char *fname, int clevel) strcpy(dst, ret); } +int parse_overlap_option(const char *arg) +{ + if ( strcasecmp(arg, "pos") == 0 || strcmp(arg, "0") == 0 ) return 0; + else if ( strcasecmp(arg, "record") == 0 || strcmp(arg, "1") == 0 ) return 1; + else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; + else return -1; +} diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index f524b21..df12fc4 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -71,7 +71,7 @@ const char *hts_bcf_wmode(int file_type) return "w"; // uncompressed VCF } -const char *hts_bcf_wmode2(int file_type, char *fname) +const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); int len = strlen(fname); @@ -82,7 +82,7 @@ const char *hts_bcf_wmode2(int file_type, char *fname) return hts_bcf_wmode(file_type); } -void set_wmode(char dst[8], int file_type, char *fname, int clevel) +void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; int len = fname ? strlen(fname) : 0; @@ -102,3 +102,10 @@ void set_wmode(char dst[8], int file_type, char *fname, int clevel) strcpy(dst, ret); } +int parse_overlap_option(const char *arg) +{ + if ( strcasecmp(arg, "pos") == 0 || strcmp(arg, "0") == 0 ) return 0; + else if ( strcasecmp(arg, "record") == 0 || strcmp(arg, "1") == 0 ) return 1; + else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; + else return -1; +} diff --git a/bcftools/version.sh b/bcftools/version.sh index 1bcfcea..0e51fdd 100755 --- a/bcftools/version.sh +++ b/bcftools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.14 +VERSION=1.15.1 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/cy_build.py b/cy_build.py index aff41a0..2726e94 100644 --- a/cy_build.py +++ b/cy_build.py @@ -8,7 +8,7 @@ except ImportError: from setuptools.command.build_ext import build_ext from distutils.extension import Extension -from distutils.sysconfig import get_config_vars, get_python_lib, get_python_version +from distutils.sysconfig import get_config_vars, get_python_version from pkg_resources import Distribution diff --git a/devtools/import.py b/devtools/import.py index ffbd88f..b4d6ffa 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -18,7 +18,6 @@ import hashlib EXCLUDE = { "samtools": ( "test", "misc", - "razip.c", "bgzip.c", "main.c", "calDepth.c", @@ -27,15 +26,13 @@ EXCLUDE = { "bam_tview.h", "bam_tview_html.c", "bam_tview_curses.c", - "bamcheck.c", - "chk_indel.c", + "bam2bcf.c", + "bam2bcf.h", "vcf-miniview.c", ), "bcftools": ( "test", "plugins", "peakfit.c", "peakfit.h", - # needs to renamed, name conflict with samtools reheader - # "reheader.c", "polysomy.c"), "htslib": ( 'htslib/tabix.c', 'htslib/bgzip.c', @@ -93,9 +90,6 @@ def _update_pysam_files(cf, destdir): else: lines = re.sub(r"int main\(", "int {}_{}_main(".format( basename, subname), lines) - if basename == "samtools": - lines = re.sub(r"main_(reheader)\(", - r"samtools_main_\1(", lines) lines = re.sub(r"\b({}_stdout)\b".format(basename), r"\1_internal", lines) lines = re.sub(r"\bexit\(", "{}_exit(".format(basename), lines) lines = re.sub(r"\bstderr\b", "{}_stderr".format(basename), lines) diff --git a/doc/api.rst b/doc/api.rst index 6246c35..47fe314 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -21,7 +21,7 @@ iteration returns a :class:`~pysam.AlignedSegment` object which represents a single read along with its fields and optional tags:: for read in samfile.fetch('chr1', 100, 120): - print read + print(read) 
samfile.close() @@ -55,12 +55,11 @@ reads are represented as :class:`~pysam.PileupRead` objects in the import pysam samfile = pysam.AlignmentFile("ex1.bam", "rb" ) for pileupcolumn in samfile.pileup("chr1", 100, 120): - print ("\ncoverage at base %s = %s" % - (pileupcolumn.pos, pileupcolumn.n)) + print("\ncoverage at base %s = %s" % (pileupcolumn.pos, pileupcolumn.n)) for pileupread in pileupcolumn.pileups: if not pileupread.is_del and not pileupread.is_refskip: # query position is None if is_del or is_refskip is set. - print ('\tbase in read %s = %s' % + print('\tbase in read %s = %s' % (pileupread.alignment.query_name, pileupread.alignment.query_sequence[pileupread.query_position])) @@ -82,7 +81,7 @@ The above code outputs:: base in read EAS51_64:3:190:727:308 = G ... -Commands available in :term:`csamtools` are available as simple +Commands available in `samtools`_ are available as simple function calls. For example:: pysam.sort("-o", "output.bam", "ex1.bam") @@ -99,7 +98,7 @@ tabix indexed tab-separated file formats with genomic data:: tabixfile = pysam.TabixFile("example.gtf.gz") for gtf in tabixfile.fetch("chr1", 1000, 2000): - print (gtf.contig, gtf.start, gtf.end, gtf.gene_id) + print(gtf.contig, gtf.start, gtf.end, gtf.gene_id) :class:`~pysam.TabixFile` implements lazy parsing in order to iterate over large tables efficiently. diff --git a/doc/conf.py b/doc/conf.py index 39b6f45..162ea38 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -57,15 +57,16 @@ rst_epilog = ''' .. _samtools: http://samtools.sourceforge.net/ .. _bcftools: https://samtools.github.io/bcftools/bcftools.html .. _htslib: http://www.htslib.org/ -.. _tabix: http://samtools.sourceforge.net/tabix.shtml +.. _tabix: http://www.htslib.org/doc/tabix.html .. _Galaxy: https://main.g2.bx.psu.edu/ -.. _cython: http://cython.org/ -.. _python: http://python.org/ +.. _cython: https://cython.org/ +.. _python: https://www.python.org/ .. _pypi: https://pypi.org/ .. _pip: https://pip.pypa.io/ -.. 
_pyximport: http://www.prescod.net/pyximport/ +.. _pyximport: https://github.com/cython/cython/tree/master/pyximport .. _conda: https://conda.io/docs/ .. _bioconda: https://bioconda.github.io/ +.. _sphinx: https://www.sphinx-doc.org/en/master/usage/installation.html ''' autosummary_generate = True diff --git a/doc/developer.rst b/doc/developer.rst index ca49fdc..5bc3066 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -9,26 +9,27 @@ The top level directory is organized in the following directories: :file:`pysam` - Code specific to pysam + Code specific to pysam. :file:`doc` - The documentation. To build the latest documentation type:: + The documentation. To build the latest documentation, first install + `Sphinx`_ and then type:: make -C doc html :file:`tests` - Code and data for testing and benchmarking + Code and data for testing and benchmarking. :file:`htslib` - Source code from htslib_ shipped with pysam. See + Source code from `htslib`_ shipped with pysam. See :file:`import.py` about importing. :file:`samtools` - Source code from :term:`csamtools` shipped with pysam. See + Source code from `samtools`_ shipped with pysam. See :file:`import.py` about importing. :file:`bcftools` - Source code from :term:`cbcftools` shipped with pysam. See + Source code from `bcftools`_ shipped with pysam. See :file:`import.py` about importing. @@ -36,7 +37,7 @@ Importing new versions of htslib and samtools ============================================= See instructions in :file:`import.py` to import the latest -version of htslib_, samtools_ and bcftools_. +versions of `htslib`_, `samtools`_ and `bcftools`_. Unit testing ============ @@ -71,15 +72,9 @@ See :ref:`Benchmarking` for more on this topic. Contributors ============ -Please see github for a list of all contributors: +Please see Github for a list of all contributors: https://github.com/pysam-developers/pysam/graphs/contributors Many thanks to all contributors for helping in making pysam useful. 
- - - - - - diff --git a/doc/faq.rst b/doc/faq.rst index fc39b60..e2352eb 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -41,7 +41,7 @@ parts have not been fully tested. A related issue is when different threads read from the same file object - or the same thread uses two iterators over a file. There is only a single file-position for each opened file. To prevent this from -hapeding, use the option ``multiple_iterator=True`` when calling +happening, use the option ``multiple_iterators=True`` when calling a fetch() method. This will return an iterator on a newly opened file. @@ -73,21 +73,20 @@ The following code will cause unexpected behaviour:: samfile = pysam.AlignmentFile("pysam_ex1.bam", "rb") iter1 = samfile.fetch("chr1") - print (iter1.next().reference_id) + print(next(iter1).reference_id) iter2 = samfile.fetch("chr2") - print (iter2.next().reference_id) - print (iter1.next().reference_id) - + print(next(iter2).reference_id) + print(next(iter1).reference_id) + This will give the following output:: 0 1 Traceback (most recent call last): File "xx.py", line 8, in - print iter1.next().reference_id - File "calignmentfile.pyx", line 1408, in - pysam.calignmentfile.IteratorRowRegion.__next__ - (pysam/calignmentfile.c:16461) + print(next(iter1).reference_id) + File "libcalignmentfile.pyx", line 2103, + in pysam.libcalignmentfile.IteratorRowRegion.__next__ StopIteration Note how the second iterator stops as the file pointer has moved to @@ -95,11 +94,11 @@ chr2. 
The correct way to work with multiple iterators is:: samfile = pysam.AlignmentFile("pysam_ex1.bam", "rb") - iter1 = samfile.fetch("chr1", all) - print (iter1.next().reference_id) + iter1 = samfile.fetch("chr1", multiple_iterators=True) + print(next(iter1).reference_id) iter2 = samfile.fetch("chr2") - print (iter2.next().reference_id) - print (iter1.next().reference_id) + print(next(iter2).reference_id) + print(next(iter1).reference_id) Here, the output is:: @@ -135,31 +134,31 @@ in the iteration by adding the ``until_eof=True`` flag:: bf = pysam.AlignmentFile(fname, "rb") for r in bf.fetch(until_eof=True): if r.is_unmapped: - print ("read is unmapped") + print("read is unmapped") -I can't call AlignmentFile.fetch on a file without index -======================================================== +I can't call AlignmentFile.fetch on a file without an index +=========================================================== :meth:`~pysam.AlignmentFile.fetch` requires an index when -iterating over a SAM/BAM file. To iterate over a file without -index, use the ``until_eof=True``:: +iterating over a SAM/BAM file. To iterate over a file without an +index, use ``until_eof=True``:: bf = pysam.AlignmentFile(fname, "rb") for r in bf.fetch(until_eof=True): - print (r) + print(r) BAM files with a large number of reference sequences are slow ============================================================= -If you have many reference sequences in a bam file, the following +If you have many reference sequences in a BAM file, the following might be slow:: track = pysam.AlignmentFile(fname, "rb") for aln in track.fetch(): pass -The reason is that track.fetch() will iterate through the bam file +The reason is that track.fetch() will iterate through the BAM file for each reference sequence in the order as it is defined in the header. This might require a lot of jumping around in the file. 
To avoid this, use:: @@ -174,24 +173,24 @@ Weirdness with spliced reads in samfile.pileup(chr,start,end) given spliced alig =============================================================================================================== Spliced reads are reported within samfile.pileup. To ignore these -in your analysis, test the flags ``is_del == True and indel=0`` +in your analysis, test the flags ``is_del == True and indel == 0`` in the :class:`~.PileupRead` object. I can't edit quality scores in place ==================================== -Editing reads in-place generally works, though there is some -quirk to be aware of. Assigning to AlignedRead.seq will invalidate -any quality scores in AlignedRead.qual. The reason is that samtools +Editing reads in-place generally works, though there is one +quirk to be aware of. Assigning to AlignedSegment.query_sequence will invalidate +any quality scores in AlignedSegment.query_qualities. The reason is that samtools manages the memory of the sequence and quality scores together and thus requires them to always be of the same length or 0. Thus, to in-place edit the sequence and quality scores, copies of the quality scores need to be taken. Consider trimming for example:: - q = read.qual - read.seq = read.seq[5:10] - read.qual = q[5:10] + quals = read.query_qualities + read.query_sequence = read.query_sequence[5:10] + read.query_qualities = quals[5:10] Why is there no SNPCaller class anymore? ========================================= @@ -201,13 +200,13 @@ danger that the pysam implementations might show different behaviour from the samtools implementation, which would have caused a lot of confusion. The best way to use samtools SNP calling from python is to use the -:meth:`pysam.mpileup` command and parse the output directly. +:meth:`pysam.mpileup` command and parse the output directly. 
I get an error 'PileupProxy accessed after iterator finished' ============================================================= Pysam works by providing proxy objects to objects defined within -the C-samtools package. Thus, some attention must be paid at the +the C-samtools package. Thus, some attention must be paid to the lifetime of objects. The following to code snippets will cause an error:: @@ -216,14 +215,14 @@ error:: pass for pp in p.pileups: - print pp + print(pp) -The iteration has finished, thus the contents of p are invalid. A +The iteration has finished, thus the contents of ``p`` are invalid. Another variation of this:: p = next(AlignmentFile('ex1.bam').pileup('chr1', 1000, 1010)) for pp in p.pileups: - print pp + print(pp) Again, the iteration finishes as the temporary iterator created by pileup goes out of scope. The solution is to keep a handle @@ -232,14 +231,14 @@ to the iterator that remains alive:: i = AlignmentFile('ex1.bam').pileup('chr1', 1000, 1010) p = next(i) for pp in p.pileups: - print pp + print(pp) Pysam won't compile =================== Compiling pysam can be tricky as there are numerous variables that differ between build environments such as OS, version, python version, -and compiler. It is difficult to build software that build cleanly +and compiler. It is difficult to build software that builds cleanly on all systems and the process might fail. Please see the `pysam user group `_ @@ -254,7 +253,7 @@ this at the very top of its error messages but will follow it with any unknown function or variable definition it encounters later on. -A general advice is to always use the latest version on python_ and +General advice is to always use the latest version on python_ and cython_ when building pysam. There are some known incompatibilities: * Python 3.4 requires cython 0.20.2 or later (see `here @@ -269,11 +268,11 @@ In version 0.10.0 and onwards, all pysam extension modules contain a ``lib``-prefix. 
This facilates linking against pysam extension modules with compilers that require to start with ``lib``. As a consequence, all code using pysam extension modules directly will need to be -adapted. For example, for example:: +adapted. For example,:: cimport pysam.csamtools will become:: - cimport pysam.libcamtools + cimport pysam.libcsamtools diff --git a/doc/glossary.rst b/doc/glossary.rst index 0389270..b67e2f4 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -16,19 +16,29 @@ Glossary insertions and another 2 matches. region - A genomic region, stated relative to a reference sequence. A - region consists of reference name ('chr1'), start (10000), and + A genomic region, stated relative to a :term:`reference` sequence. A + region consists of reference name ('chr1'), start (15000), and end (20000). Start and end can be omitted for regions spanning - a whole chromosome. If end is missing, the region will span from - start to the end of the chromosome. Within pysam, coordinates - are 0-based, half-open intervals, i.e., the position 10,000 is - part of the interval, but 20,000 is not. An exception are - :term:`samtools` compatible region strings such as - 'chr1:10000-20000', which are closed, i.e., both positions 10,000 - and 20,000 are part of the interval. + a whole chromosome. If ``end`` is missing, the region will span from + ``start`` to the end of the chromosome. Within pysam, coordinates + are 0-based half-open intervals, i.e., the first base of the + reference sequence is numbered zero; and the base at position + ``start`` is part of the interval, but the base at ``end`` is not. + + When a region is written as a single string using + `samtools`_-compatible notation, e.g., 'chr1:15001-20000', + the string's coordinates instead represent a 1-based closed interval, + i.e., both (1-based) positions 15,001 and 20,000 are part of the + interval. (This example denotes the same 5,000-base region as the + example in the previous paragraph.) 
+ + genotype + An individual's collection of genes. It can also refer to the two alleles + inherited for a particular gene. column - Reads that are aligned to a base in the :term:`reference` sequence. + The portion of reads aligned to a single base in the + :term:`reference` sequence. tid The :term:`target` id. The target id is 0 or a positive integer mapping to @@ -38,8 +48,16 @@ Glossary contig The sequence that a :term:`tid` refers to. For example ``chr1``, ``contig123``. - Reference - Synonym for contig + reference + Synonym for contig. + + BED + Browser Extensible Data format. A text file format used to store genomic + :term:`regions` as coordinates and associated notations. + + GTF + The Gene Transfer Format is a file format used to hold information + about gene structure. SAM A textual format for storing genomic alignment information. @@ -64,12 +82,6 @@ Glossary pileup Pileup - samtools - The samtools_ package. - - csamtools - The samtools_ C-API. - fetching Retrieving all mapped reads mapped to a :term:`region`. @@ -106,10 +118,10 @@ Glossary to :term:`soft clipped` reads. VCF - Variant call format + Variant Call Format. BCF - Binary :term:`VCF` + Binary :term:`VCF`. FASTA Simple text format containing sequence data, with only the bare @@ -124,7 +136,7 @@ Glossary files. faidx - Utility in the samtools package to index :term:`fasta` formatted + Utility in the `samtools`_ package to index :term:`fasta` formatted files. bgzip diff --git a/doc/index.rst b/doc/index.rst index bfdc602..6f4e408 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.14*, *samtools-1.14*, and *bcftools-1.14*. +The current version wraps *htslib-1.15.1*, *samtools-1.15.1*, and *bcftools-1.15.1*. 
To install the latest release, type:: @@ -78,7 +78,7 @@ References http://samtools.sourceforge.net The cython C-extensions for python - http://cython.org/ + https://cython.org/ The python language - http://www.python.org + https://www.python.org diff --git a/doc/release.rst b/doc/release.rst index a2f31d4..d731c36 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,33 @@ Release notes ============= +Release 0.19.1 +============== + +This release wraps htslib/samtools/bcftools version 1.15.1. + +* [#1104] add an add_samples() method to quickly add multiple samples + to VCF. + +Release 0.19.0 +============== + +This release wraps htslib/samtools/bcftools version 1.15. + +* [#1085] Improve getopt()/getopt_long() resetting when running samtools/bcftools commands + +* [#1078] Support BAM_CPAD in get_aligned_pairs + +* [#1063] Run flake8 and fix some linting issues + +* [#1088] Add AlignedSegment is_mapped/mate_is_mapped/is_forward/mate_is_forward properties + +* Write an absent AlignedSegment.qual as all-bytes-0xff + +* Fix BGZFile.read() behaviour near or at EOF + +* First API for the htslib modified bases interface + Release 0.18.0 ============== @@ -398,7 +425,7 @@ changes, for example:: will become:: - cimport pysam.libcamtools + cimport pysam.libcsamtools Release 0.9.1 ============= diff --git a/doc/usage.rst b/doc/usage.rst index fc4f2bb..3c8ab04 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -35,7 +35,7 @@ object:: iter = samfile.fetch("seq1", 10, 20) for x in iter: - print (str(x)) + print(str(x)) :meth:`pysam.AlignmentFile.fetch` returns all reads overlapping a region sorted by the first aligned base in the :term:`reference` @@ -51,7 +51,7 @@ each base in the :term:`reference` sequence the reads that map to that particular position. 
In the typical view of reads stacking vertically on top of the reference sequence similar to a multiple alignment, :term:`fetching` iterates over the rows of this implied multiple -alignment while a :term:`pileup` iterates over the :term:`columns`. +alignment while a :term:`pileup` iterates over the :term:`columns`. Calling :meth:`~pysam.AlignmentFile.pileup` will return an iterator over each :term:`column` (reference base) of a specified @@ -62,7 +62,7 @@ some additional information:: iter = samfile.pileup('seq1', 10, 20) for x in iter: - print (str(x)) + print(str(x)) Creating BAM/CRAM/SAM files from scratch @@ -123,7 +123,7 @@ Note that the file open mode needs to changed from ``r`` to ``rb``. Using samtools commands within python ===================================== -Commands available in :term:`csamtools` are available as simple +Commands available in `samtools`_ are available as simple function calls. Command line options are provided as arguments. For example:: @@ -152,7 +152,7 @@ Argument errors raise a :class:`pysam.SamtoolsError`:: if retval: raise SamtoolsError( "\n".join( stderr ) ) pysam.SamtoolsError: 'Usage: samtools sort [-n] [-m ] \n' -Messages from :term:`csamtools` on stderr are captured and are +Messages from `samtools`_ on stderr are captured and are available using the :meth:`getMessages` method:: pysam.sort.getMessage() @@ -186,21 +186,21 @@ Similar to :class:`~pysam.AlignmentFile.fetch`, intervals within a region can be retrieved by calling :meth:`~pysam.TabixFile.fetch()`:: for row in tbx.fetch("chr1", 1000, 2000): - print (str(row)) + print(str(row)) This will return a tuple-like data structure in which columns can be retrieved by numeric index:: for row in tbx.fetch("chr1", 1000, 2000): - print ("chromosome is", row[0]) + print("chromosome is", row[0]) By providing a parser to :class:`~pysam.AlignmentFile.fetch` or :class:`~pysam.TabixFile`, the data will we presented in parsed form:: for row in tbx.fetch("chr1", 1000, 2000, 
parser=pysam.asTuple()): - print ("chromosome is", row.contig) - print ("first field (chrom)=", row[0]) + print("chromosome is", row.contig) + print("first field (chrom)=", row[0]) Pre-built parsers are available for :term:`bed` (:class:`~pysam.asBed`) formatted files and :term:`gtf` @@ -208,7 +208,7 @@ Pre-built parsers are available for :term:`bed` become available through named access, for example:: for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asBed()): - print ("name is", row.name) + print("name is", row.name) .. Currently inactivated as pileup deprecated @@ -227,7 +227,7 @@ become available through named access, for example:: .. pileup_iter = samfile.pileup( stepper = "samtools", fastafile = fastafile ) .. sncpall_iter = pysam.IteratorSNPCalls(pileup_iter) .. for call in snpcall_iter: -.. print str(call) +.. print(str(call)) .. Usage of :class:`pysam.SNPCaller` is similar:: @@ -235,7 +235,7 @@ become available through named access, for example:: .. fastafile = pysam.Fastafile( "ex1.fa" ) .. pileup_iter = samfile.pileup( stepper = "samtools", fastafile = fastafile ) .. snpcaller = pysam.SNPCaller.call(pileup_iter) -.. print snpcaller( "chr1", 100 ) +.. print(snpcaller( "chr1", 100 )) .. Note the use of the option *stepper* to control which reads are included in the .. in the :term:`pileup`. 
The ``samtools`` stepper implements the same read selection @@ -266,7 +266,7 @@ simple variant attributes such as :class:`~pysam.VariantRecord.contig`, :class:`~pysam.VariantRecord.pos`, :class:`~pysam.VariantRecord.ref`:: for rec in bcf_in.fetch(): - print (rec.pos) + print(rec.pos) but also to complex attributes such as the contents to the :class:`~pysam.VariantRecord.info`, :class:`~pysam.VariantRecord.format` @@ -275,15 +275,15 @@ complex attributes are views on the underlying htslib data structures and provide dictionary-like access to the data:: for rec in bcf_in.fetch(): - print (rec.info) - print (rec.info.keys()) - print (rec.info["DP"]) + print(rec.info) + print(rec.info.keys()) + print(rec.info["DP"]) The :py:attr:`~pysam.VariantFile.header` attribute (:class:`~pysam.VariantHeader`) provides access information stored in the :term:`vcf` header. The complete header can be printed:: - >>> print (bcf_in.header) + >>> print(bcf_in.header) ##fileformat=VCFv4.2 ##FILTER= ##fileDate=20090805 @@ -315,26 +315,26 @@ stored in the :term:`vcf` header. 
The complete header can be printed:: Individual contents such as contigs, info fields, samples, formats can be retrieved as attributes from :py:attr:`~pysam.VariantFile.header`:: - >>> print (bcf_in.header.contigs) + >>> print(bcf_in.header.contigs) To convert these views to native python types, iterate through the views:: - >>> print list((bcf_in.header.contigs)) + >>> print(list((bcf_in.header.contigs))) ['M', '17', '20'] - >>> print list((bcf_in.header.filters)) + >>> print(list((bcf_in.header.filters))) ['PASS', 'q10', 's50'] - >>> print list((bcf_in.header.info)) + >>> print(list((bcf_in.header.info))) ['NS', 'DP', 'AF', 'AA', 'DB', 'H2'] - >>> print list((bcf_in.header.samples)) + >>> print(list((bcf_in.header.samples))) ['NA00001', 'NA00002', 'NA00003'] Alternatively, it is possible to iterate through all records in the header returning objects of type :py:class:`~pysam.VariantHeaderRecord`:: :: >>> for x in bcf_in.header.records: - >>> print (x) - >>> print (x.type, x.key) + >>> print(x) + >>> print(x.type, x.key) GENERIC fileformat FILTER FILTER GENERIC fileDate @@ -364,7 +364,7 @@ Extending pysam =============== Using pyximport_, it is (relatively) straight-forward to access pysam -internals and the underlying samtools library. An example is provided +internals and the underlying `samtools`_ library. An example is provided in the :file:`tests` directory. The example emulates the samtools flagstat command and consists of three files: diff --git a/import/pysam.c b/import/pysam.c index 2a81e4d..168255c 100644 --- a/import/pysam.c +++ b/import/pysam.c @@ -1,5 +1,4 @@ -#include -#include +#include #include #include #include @@ -62,6 +61,15 @@ static int @pysam@_status = 0; int @pysam@_dispatch(int argc, char *argv[]) { + /* Reset getopt()/getopt_long() processing. 
*/ +#if defined __GLIBC__ + optind = 0; +#elif defined _OPTRESET || defined _OPTRESET_DECLARED + optreset = optind = 1; +#else + optind = 1; +#endif + if (setjmp(@pysam@_jmpbuf) == 0) return @pysam@_main(argc, argv); else @@ -73,17 +81,3 @@ void @pysam@_exit(int status) @pysam@_status = status; longjmp(@pysam@_jmpbuf, 1); } - - -void @pysam@_set_optind(int val) -{ - // setting this in cython via - // "from posix.unistd cimport optind" - // did not work. - // - // setting to 0 forces a complete re-initialization - optind = val; -} - - - diff --git a/import/pysam.h b/import/pysam.h index 8dbb09e..da07281 100644 --- a/import/pysam.h +++ b/import/pysam.h @@ -53,8 +53,22 @@ int @pysam@_dispatch(int argc, char *argv[]); void PYSAM_NORETURN @pysam@_exit(int status); -void @pysam@_set_optind(int); - extern int @pysam@_main(int argc, char *argv[]); - + +/* Define these only in samtools/bcftools C source, not Cython code. */ +#if !(defined CYTHON_ABI || defined CYTHON_HEX_VERSION) + +/*! Several non-static function names are used in both samtools and bcftools. + Both libcsamtools.so and libcbcftools.so are loaded simultaneously, leading + to collisions and wrong functions being called. #define these names so the + actual symbol names include distinct prefixes to avoid collisions. 
+ */ +#define main_consensus @pysam@_main_consensus +#define main_reheader @pysam@_main_reheader +#define bam_smpl_init @pysam@_bam_smpl_init +#define bam_smpl_destroy @pysam@_bam_smpl_destroy +#define read_file_list @pysam@_read_file_list + +#endif + #endif diff --git a/pysam/Pileup.py b/pysam/Pileup.py index 1fe05ec..2d05e3a 100644 --- a/pysam/Pileup.py +++ b/pysam/Pileup.py @@ -3,7 +3,7 @@ import collections import pysam PileupSubstitution = collections.namedtuple("PileupSubstitution", - " ".join(( + ( "chromosome", "pos", "reference_base", @@ -13,10 +13,10 @@ PileupSubstitution = collections.namedtuple("PileupSubstitution", "mapping_quality", "coverage", "read_bases", - "base_qualities"))) + "base_qualities")) PileupIndel = collections.namedtuple("PileupIndel", - " ".join(( + ( "chromosome", "pos", "reference_base", @@ -29,7 +29,7 @@ PileupIndel = collections.namedtuple("PileupIndel", "second_allele", "reads_first", "reads_second", - "reads_diff"))) + "reads_diff")) def iterate(infile): diff --git a/pysam/__init__.py b/pysam/__init__.py index a6ff6d7..ec52d94 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -1,8 +1,8 @@ import os -import sys import sysconfig from pysam.libchtslib import * +import pysam.libchtslib as libchtslib from pysam.libcsamtools import * from pysam.libcbcftools import * from pysam.libcutils import * @@ -32,21 +32,21 @@ import pysam.config # export all the symbols from separate modules -__all__ = \ - libchtslib.__all__ +\ - libcutils.__all__ +\ - libctabix.__all__ +\ - libcvcf.__all__ +\ - libcbcf.__all__ +\ - libcbgzf.__all__ +\ - libcfaidx.__all__ +\ - libctabixproxies.__all__ +\ - libcalignmentfile.__all__ +\ - libcalignedsegment.__all__ +\ - libcsamfile.__all__ +\ - ["SamtoolsError"] +\ +__all__ = ( + libchtslib.__all__ + # type: ignore + libcutils.__all__ + # type: ignore + libctabix.__all__ + # type: ignore + libcvcf.__all__ + # type: ignore + libcbcf.__all__ + # type: ignore + libcbgzf.__all__ + # type: ignore + 
libcfaidx.__all__ + # type: ignore + libctabixproxies.__all__ + # type: ignore + libcalignmentfile.__all__ + # type: ignore + libcalignedsegment.__all__ + # type: ignore + libcsamfile.__all__ + # type: ignore + ["SamtoolsError"] + ["Pileup"] - +) from pysam.version import __version__, __samtools_version__ diff --git a/pysam/bcftools.py b/pysam/bcftools.py index f65e17c..4cbe82f 100644 --- a/pysam/bcftools.py +++ b/pysam/bcftools.py @@ -13,6 +13,7 @@ BCFTOOLS_DISPATCH = [ "reheader", "sort", "view", + "head", "call", "consensus", "cnv", diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index 473c5b1..32e2c97 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -49,8 +49,7 @@ cdef class AlignedSegment: # an existing tag of the same name will be replaced. cpdef set_tag(self, tag, value, value_type=?, replace=?) - # add an alignment tag with value to the AlignedSegment - # an existing tag of the same name will be replaced. + # get an alignment tag from the AlignedSegment cpdef get_tag(self, tag, with_value_type=?) 
# return true if tag exists diff --git a/pysam/libcalignedsegment.pyi b/pysam/libcalignedsegment.pyi new file mode 100644 index 0000000..f53c318 --- /dev/null +++ b/pysam/libcalignedsegment.pyi @@ -0,0 +1,216 @@ +import enum +import re +import sys +from array import array +from typing import Any, List, Optional, Dict, Tuple, Union, overload + +if sys.version_info < (3, 8): + from typing_extensions import Literal +else: + from typing import Literal + +from pysam import AlignmentHeader # type: ignore + +CMATCH: int +CINS: int +CDEL: int +CREF_SKIP: int +CSOFT_CLIP: int +CHARD_CLIP: int +CPAD: int +CEQUAL: int +CDIFF: int +CBACK: int + +FPAIRED: int +FPROPER_PAIR: int +FUNMAP: int +FMUNMAP: int +FREVERSE: int +FMREVERSE: int +FREAD1: int +FREAD2: int +FSECONDARY: int +FQCFAIL: int +FDUP: int +FSUPPLEMENTARY: int + +CIGAR2CODE: Dict[int, str] +CIGAR_REGEX: re.Pattern +DATATYPE2FORMAT: Dict[int, Tuple[str, int]] +KEY_NAMES: List[str] + +TagValue = Union[str, int, float, array] + +class CIGAR_OPS(enum.IntEnum): + CBACK: int + CDEL: int + CDIFF: int + CEQUAL: int + CHARD_CLIP: int + CINS: int + CMATCH: int + CPAD: int + CREF_SKIP: int + CSOFT_CLIP: int + +class SAM_FLAGS(enum.IntEnum): + FDUP: int + FMREVERSE: int + FMUNMAP: int + FPAIRED: int + FPROPER_PAIR: int + FQCFAIL: int + FREAD1: int + FREAD2: int + FREVERSE: int + FSECONDARY: int + FSUPPLEMENTARY: int + FUNMAP: int + +class AlignedSegment: + header: AlignmentHeader + query_name: Optional[str] + flag: int + reference_name: Optional[str] + reference_id: int + reference_start: int + mapping_quality: int + cigarstring: Optional[str] + next_reference_id: int + next_reference_name: Optional[str] + next_reference_start: int + template_length: int + query_sequence: Optional[str] + query_qualities: Optional[array] + bin: int + is_paired: bool + is_proper_pair: bool + is_unmapped: bool + mate_is_unmapped: bool + is_reverse: bool + mate_is_reverse: bool + is_read1: bool + is_read2: bool + is_secondary: bool + is_qcfail: 
bool + is_duplicate: bool + is_supplementary: bool + cigartuples: Optional[List[Tuple[int, int]]] + def __init__(self, header: Optional[AlignmentHeader] = ...) -> None: ... + def compare(self, other: Any) -> int: ... + def to_string(self) -> str: ... + @classmethod + def fromstring(cls, sam: str, header: AlignmentHeader) -> AlignedSegment: ... + def to_dict(self) -> Dict: ... + @classmethod + def from_dict(cls, sam_dict: Dict[str, Any], header: AlignmentHeader) -> Any: ... + def get_reference_positions(self, full_length: bool = ...) -> List[int]: ... + @property + def query_length(self) -> int: ... + @property + def reference_end(self) -> Optional[int]: ... + @property + def reference_length(self) -> Optional[int]: ... + @property + def query_alignment_sequence(self) -> Optional[str]: ... + @property + def query_alignment_qualities(self) -> Optional[array]: ... + @property + def query_alignment_start(self) -> int: ... + @property + def query_alignment_end(self) -> int: ... + @property + def query_alignment_length(self) -> int: ... + def infer_query_length(self) -> Optional[int]: ... + def infer_read_length(self) -> Optional[int]: ... + def get_reference_sequence(self) -> str: ... + def get_forward_sequence(self) -> Optional[str]: ... + def get_forward_qualities(self) -> Optional[array]: ... + def get_aligned_pairs( + self, matches_only: bool = ..., with_seq: bool = ... + ) -> List[Tuple[int, int]]: ... + def get_blocks(self) -> List[Tuple[int, int]]: ... + def get_overlap(self, start: int, end: int) -> Optional[int]: ... + def get_cigar_stats(self) -> Tuple[array, array]: ... + def set_tag( + self, + tag: str, + value: Union[int, float, str, bytes, array, List, Tuple, None], + value_type: Optional[ + Literal["A", "i", "f", "Z", "H", "B", "c", "C", "s", "S", "I"] + ] = ..., + replace: bool = ..., + ) -> None: ... + def has_tag(self, tag: str) -> bool: ... + @overload + def get_tag(self, tag: str, with_value_type: Literal[False]) -> TagValue: ... 
+ @overload + def get_tag(self, tag, with_value_type: Literal[True]) -> Tuple[TagValue, str]: ... + @overload + def get_tag( + self, tag, with_value_type: bool = ... + ) -> Union[TagValue, Tuple[TagValue, str]]: ... + @overload + def get_tags( + self, with_value_type: Literal[False] + ) -> List[Tuple[str, TagValue]]: ... + @overload + def get_tags( + self, with_value_type: Literal[True] + ) -> List[Tuple[str, TagValue, str]]: ... + @overload + def get_tags( + self, with_value_type: bool = ... + ) -> Union[List[Tuple[str, TagValue, str]], List[Tuple[str, TagValue]]]: ... + def set_tags(self, tags: Any) -> None: ... + def __eq__(self, other): ... + def __ge__(self, other): ... + def __gt__(self, other): ... + def __le__(self, other): ... + def __lt__(self, other): ... + def __ne__(self, other): ... + +class PileupRead: + @property + def alignment(self) -> AlignedSegment: ... + @property + def query_position(self) -> Optional[int]: ... + @property + def query_position_or_next(self) -> int: ... + @property + def indel(self) -> int: ... + @property + def level(self) -> int: ... + @property + def is_del(self) -> int: ... + @property + def is_head(self) -> int: ... + @property + def is_tail(self) -> int: ... + @property + def is_refskip(self) -> int: ... + +class PileupColumn: + nsegments: int + def set_min_base_quality(self, min_base_quality: int) -> None: ... + def __len__(self) -> int: ... + @property + def reference_id(self) -> int: ... + @property + def reference_name(self) -> Optional[str]: ... + @property + def reference_pos(self) -> int: ... + @property + def pileups(self) -> List[PileupRead]: ... + def get_num_aligned(self) -> int: ... + def get_query_sequences( + self, + mark_matches: bool = ..., + mark_ends: bool = ..., + add_indels: bool = ..., + ) -> List[str]: ... + def get_query_qualities(self) -> List[int]: ... + def get_mapping_qualities(self) -> List[int]: ... + def get_query_positions(self) -> List[int]: ... + def get_query_names(self) -> List[str]: ... 
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index da7274c..810a861 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -64,7 +64,7 @@ cimport cython from cpython cimport array as c_array from cpython.version cimport PY_MAJOR_VERSION from cpython cimport PyBytes_FromStringAndSize -from libc.string cimport strchr +from libc.string cimport memset, strchr from cpython cimport array as c_array from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \ INT8_MAX, INT16_MAX, INT32_MAX, \ @@ -705,11 +705,13 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): sequence. Positions corresponding to `N` (skipped region from the reference) - in the CIGAR string will not appear in the returned sequence. The - MD should correspondingly not contain these. Thus proper tags are:: + or `P` (padding (silent deletion from padded reference)) in the CIGAR + string will not appear in the returned sequence. The MD should + correspondingly not contain these. Thus proper tags are:: - Deletion from the reference: cigar=5M1D5M MD=5^C5 - Skipped region from reference: cigar=5M1N5M MD=10 + Deletion from the reference: cigar=5M1D5M MD=5^C5 + Skipped region from reference: cigar=5M1N5M MD=10 + Padded region in the reference: cigar=5M1P5M MD=10 Returns ------- @@ -762,7 +764,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): s_idx += 1 elif op == BAM_CREF_SKIP: pass - elif op == BAM_CINS: + elif op == BAM_CINS or op == BAM_CPAD: for i from 0 <= i < l: # encode insertions into reference as lowercase s[s_idx] = read_sequence[r_idx] + 32 @@ -772,10 +774,6 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): pass elif op == BAM_CHARD_CLIP: pass # advances neither - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. 
Sorry about that.") cdef char * md_tag = bam_aux2Z(md_tag_ptr) cdef int md_idx = 0 @@ -795,6 +793,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): cdef uint32_t md_len = get_md_reference_length(md_tag) if md_len + insertions > max_len: + free(s) raise AssertionError( "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format( md_len, max_len, insertions)) @@ -886,16 +885,12 @@ cdef inline bytes build_reference_sequence(bam1_t * src): s_idx += 1 elif op == BAM_CREF_SKIP: pass - elif op == BAM_CINS: + elif op == BAM_CINS or op == BAM_CPAD: r_idx += l elif op == BAM_CSOFT_CLIP: pass elif op == BAM_CHARD_CLIP: pass # advances neither - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. Sorry about that.") seq = PyBytes_FromStringAndSize(s, s_idx) free(s) @@ -1362,8 +1357,8 @@ cdef class AlignedSegment: The length includes soft-clipped bases and is equal to ``len(query_sequence)``. - This property is read-only but can be set by providing a - sequence. + This property is read-only but is updated when a new query + sequence is assigned to this AlignedSegment. Returns 0 if not available. @@ -1382,12 +1377,12 @@ cdef class AlignedSegment: """read sequence bases, including :term:`soft clipped` bases (None if not present). - Note that assigning to seq will invalidate any quality scores. + Assigning to this attribute will invalidate any quality scores. Thus, to in-place edit the sequence and quality scores, copies of the quality scores need to be taken. Consider trimming for example:: q = read.query_qualities - read.query_squence = read.query_sequence[5:10] + read.query_sequence = read.query_sequence[5:10] read.query_qualities = q[5:10] The sequence is returned as it is stored in the BAM file. 
(This will @@ -1457,7 +1452,7 @@ cdef class AlignedSegment: # erase qualities p = pysam_bam_get_qual(src) - p[0] = 0xff + memset(p, 0xff, l) self.cache_query_sequence = force_str(seq) @@ -1466,8 +1461,8 @@ cdef class AlignedSegment: self.cache_query_alignment_qualities = None property query_qualities: - """read sequence base qualities, including :term:`soft - clipped` bases (None if not present). + """read sequence base qualities, including :term:`soft clipped` bases + (None if not present). Quality scores are returned as a python array of unsigned chars. Note that this is not the ASCII-encoded value typically @@ -1510,8 +1505,7 @@ cdef class AlignedSegment: p = pysam_bam_get_qual(src) if qual is None or len(qual) == 0: # if absent and there is a sequence: set to 0xff - if src.core.l_qseq != 0: - p[0] = 0xff + memset(p, 0xff, src.core.l_qseq) return # check for length match @@ -1561,6 +1555,7 @@ cdef class AlignedSegment: return (self.flag & BAM_FPROPER_PAIR) != 0 def __set__(self,val): pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR) + property is_unmapped: """true if read itself is unmapped""" def __get__(self): @@ -1571,24 +1566,60 @@ cdef class AlignedSegment: # bin as alignment length is now implicitly 1 update_bin(self._delegate) + property is_mapped: + """true if read itself is mapped + (implemented in terms of :attr:`is_unmapped`)""" + def __get__(self): + return (self.flag & BAM_FUNMAP) == 0 + def __set__(self, val): + pysam_update_flag(self._delegate, not val, BAM_FUNMAP) + update_bin(self._delegate) + property mate_is_unmapped: """true if the mate is unmapped""" def __get__(self): return (self.flag & BAM_FMUNMAP) != 0 def __set__(self,val): pysam_update_flag(self._delegate, val, BAM_FMUNMAP) + + property mate_is_mapped: + """true if the mate is mapped + (implemented in terms of :attr:`mate_is_unmapped`)""" + def __get__(self): + return (self.flag & BAM_FMUNMAP) == 0 + def __set__(self,val): + pysam_update_flag(self._delegate, not val, 
BAM_FMUNMAP) + property is_reverse: """true if read is mapped to reverse strand""" def __get__(self): return (self.flag & BAM_FREVERSE) != 0 def __set__(self,val): pysam_update_flag(self._delegate, val, BAM_FREVERSE) + + property is_forward: + """true if read is mapped to forward strand + (implemented in terms of :attr:`is_reverse`)""" + def __get__(self): + return (self.flag & BAM_FREVERSE) == 0 + def __set__(self,val): + pysam_update_flag(self._delegate, not val, BAM_FREVERSE) + property mate_is_reverse: - """true is read is mapped to reverse strand""" + """true if the mate is mapped to reverse strand""" def __get__(self): return (self.flag & BAM_FMREVERSE) != 0 def __set__(self,val): pysam_update_flag(self._delegate, val, BAM_FMREVERSE) + + property mate_is_forward: + """true if the mate is mapped to forward strand + (implemented in terms of :attr:`mate_is_reverse`)""" + def __get__(self): + return (self.flag & BAM_FMREVERSE) == 0 + def __set__(self,val): + pysam_update_flag(self._delegate, not val, BAM_FMREVERSE) + property is_read1: """true if this is read1""" def __get__(self): @@ -1645,7 +1676,9 @@ cdef class AlignedSegment: property reference_length: '''aligned length of the read on the reference genome. - This is equal to `aend - pos`. Returns None if not available.''' + This is equal to `reference_end - reference_start`. + Returns None if not available. + ''' def __get__(self): cdef bam1_t * src src = self._delegate @@ -1657,9 +1690,9 @@ cdef class AlignedSegment: property query_alignment_sequence: """aligned portion of the read. - This is a substring of :attr:`seq` that excludes flanking + This is a substring of :attr:`query_sequence` that excludes flanking bases that were :term:`soft clipped` (None if not present). It - is equal to ``seq[qstart:qend]``. + is equal to ``query_sequence[query_alignment_start:query_alignment_end]``. SAM/BAM files may include extra flanking bases that are not part of the alignment. 
These bases may be the result of the @@ -1692,9 +1725,10 @@ cdef class AlignedSegment: property query_alignment_qualities: """aligned query sequence quality values (None if not present). These - are the quality values that correspond to :attr:`query`, that - is, they exclude qualities of :term:`soft clipped` bases. This - is equal to ``qual[qstart:qend]``. + are the quality values that correspond to + :attr:`query_alignment_sequence`, that is, they exclude qualities of + :term:`soft clipped` bases. This is equal to + ``query_qualities[query_alignment_start:query_alignment_end]``. Quality scores are returned as a python array of unsigned chars. Note that this is not the ASCII-encoded value typically @@ -1727,8 +1761,8 @@ cdef class AlignedSegment: """start index of the aligned query portion of the sequence (0-based, inclusive). - This the index of the first base in :attr:`seq` that is not - soft-clipped. + This the index of the first base in :attr:`query_sequence` + that is not soft-clipped. """ def __get__(self): return getQueryStart(self._delegate) @@ -1737,16 +1771,87 @@ cdef class AlignedSegment: """end index of the aligned query portion of the sequence (0-based, exclusive) - This the index just past the last base in :attr:`seq` that is not - soft-clipped. + This the index just past the last base in :attr:`query_sequence` + that is not soft-clipped. """ def __get__(self): return getQueryEnd(self._delegate) + property modified_bases: + """Modified bases annotations from Ml/Mm tags. The output is + Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...] + with qual being (256*probability), or -1 if unknown. 
+ Strand==0 for forward and 1 for reverse strand modification + """ + def __get__(self): + cdef bam1_t * src + cdef hts_base_mod_state *m = hts_base_mod_state_alloc() + cdef hts_base_mod mods[5] + cdef int pos + + ret = {} + src = self._delegate + + if bam_parse_basemod(src, m) < 0: + return None + + n = bam_next_basemod(src, m, mods, 5, &pos) + + while n>0: + for i in range(n): + mod_code = chr(mods[i].modified_base) if mods[i].modified_base>0 else -mods[i].modified_base + mod_strand = mods[i].strand + if self.is_reverse: + mod_strand = 1 - mod_strand + key = (chr(mods[i].canonical_base), + mod_strand, + mod_code ) + ret.setdefault(key,[]).append((pos,mods[i].qual)) + + n = bam_next_basemod(src, m, mods, 5, &pos) + + if n<0: + return None + + hts_base_mod_state_free(m) + return ret + + property modified_bases_forward: + """Modified bases annotations from Ml/Mm tags. The output is + Dict[(canonical base, strand, modification)] -> [ (pos,qual), ...] + with qual being (256*probability), or -1 if unknown. + Strand==0 for forward and 1 for reverse strand modification. + The positions are with respect to the original sequence from get_forward_sequence() + """ + def __get__(self): + pmods = self.modified_bases + if pmods and self.is_reverse: + rmod = {} + + # Try to find the length of the original sequence + rlen = self.infer_read_length() + if rlen is None and self.query_sequence is None: + return rmod + else: + rlen = len(self.query_sequence) + + for k,mods in pmods.items(): + nk = k[0],1 - k[1],k[2] + for i in range(len(mods)): + + mods[i] = (rlen - 1 -mods[i][0], mods[i][1]) + rmod[nk] = mods + return rmod + + return pmods + + property query_alignment_length: """length of the aligned query sequence. 
- This is equal to :attr:`qend` - :attr:`qstart`""" + This is equal to :attr:`query_alignment_end` - + :attr:`query_alignment_start` + """ def __get__(self): cdef bam1_t * src src = self._delegate @@ -1874,7 +1979,8 @@ cdef class AlignedSegment: For inserts, deletions, skipping either query or reference position may be None. - Padding is currently not supported and leads to an exception. + For padding in the reference, the reference position will + always be None. Parameters ---------- @@ -1884,8 +1990,9 @@ cdef class AlignedSegment: side. with_seq : bool If True, return a third element in the tuple containing the - reference sequence. Substitutions are lower-case. This option - requires an MD tag to be present. + reference sequence. For CIGAR 'P' (padding in the reference) + operations, the third tuple element will be None. Substitutions + are lower-case. This option requires an MD tag to be present. Returns ------- @@ -1934,7 +2041,7 @@ cdef class AlignedSegment: qpos += 1 pos += l - elif op == BAM_CINS or op == BAM_CSOFT_CLIP: + elif op == BAM_CINS or op == BAM_CSOFT_CLIP or op == BAM_CPAD: if not _matches_only: if _with_seq: for i from pos <= i < pos + l: @@ -1974,11 +2081,6 @@ cdef class AlignedSegment: pos += l - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. Sorry about that.") - return result def get_blocks(self): @@ -2247,7 +2349,7 @@ cdef class AlignedSegment: *replace* is set to False. This is usually not recommended as a tag may only appear once in the optional alignment section. - If *value* is None, the tag will be deleted. + If *value* is `None`, the tag will be deleted. This method accepts valid SAM specification value types, which are:: @@ -2474,9 +2576,7 @@ cdef class AlignedSegment: Returns a list of all fields in the optional alignment section. Values are converted to appropriate python - values. For example: - - [(NM, 2), (RG, "GJP00TM04")] + values. 
For example: ``[(NM, 2), (RG, "GJP00TM04")]`` If *with_value_type* is set, the value type as encode in the AlignedSegment record will be returned as well: @@ -2552,7 +2652,7 @@ cdef class AlignedSegment: """sets the fields in the optional alignment section with a list of (tag, value) tuples. - The :term:`value type` of the values is determined from the + The value type of the values is determined from the python type. Optionally, a type may be given explicitly as a third value in the tuple, For example: @@ -2589,7 +2689,7 @@ cdef class AlignedSegment: new_size, pysam_bam_get_aux(src)) if retval == NULL: - raise MemoryError("could not allocated memory") + raise MemoryError("could not allocate memory") # copy data only if there is any if new_size > 0: @@ -2619,33 +2719,33 @@ cdef class AlignedSegment: # explicit declaration of getters/setters ######################################################## property qname: - """deprecated, use query_name instead""" + """deprecated, use :attr:`query_name` instead.""" def __get__(self): return self.query_name def __set__(self, v): self.query_name = v property tid: - """deprecated, use reference_id instead""" + """deprecated, use :attr:`reference_id` instead.""" def __get__(self): return self.reference_id def __set__(self, v): self.reference_id = v property pos: - """deprecated, use reference_start instead""" + """deprecated, use :attr:`reference_start` instead.""" def __get__(self): return self.reference_start def __set__(self, v): self.reference_start = v property mapq: - """deprecated, use mapping_quality instead""" + """deprecated, use :attr:`mapping_quality` instead.""" def __get__(self): return self.mapping_quality def __set__(self, v): self.mapping_quality = v property rnext: - """deprecated, use next_reference_id instead""" + """deprecated, use :attr:`next_reference_id` instead.""" def __get__(self): return self.next_reference_id def __set__(self, v): self.next_reference_id = v property pnext: - """deprecated, use 
next_reference_start instead""" + """deprecated, use :attr:`next_reference_start` instead.""" def __get__(self): return self.next_reference_start def __set__(self, v): self.next_reference_start = v property cigar: - """deprecated, use cigartuples instead""" + """deprecated, use :attr:`cigarstring` or :attr:`cigartuples` instead.""" def __get__(self): r = self.cigartuples if r is None: @@ -2653,125 +2753,129 @@ cdef class AlignedSegment: return r def __set__(self, v): self.cigartuples = v property tlen: - """deprecated, use template_length instead""" + """deprecated, use :attr:`template_length` instead.""" def __get__(self): return self.template_length def __set__(self, v): self.template_length = v property seq: - """deprecated, use query_sequence instead""" + """deprecated, use :attr:`query_sequence` instead.""" def __get__(self): return self.query_sequence def __set__(self, v): self.query_sequence = v property qual: - """deprecated, query_qualities instead""" + """deprecated, use :attr:`query_qualities` instead.""" def __get__(self): return array_to_qualitystring(self.query_qualities) def __set__(self, v): self.query_qualities = qualitystring_to_array(v) property alen: - """deprecated, reference_length instead""" + """deprecated, use :attr:`reference_length` instead.""" def __get__(self): return self.reference_length def __set__(self, v): self.reference_length = v property aend: - """deprecated, reference_end instead""" + """deprecated, use :attr:`reference_end` instead.""" def __get__(self): return self.reference_end def __set__(self, v): self.reference_end = v property rlen: - """deprecated, query_length instead""" + """deprecated, use :attr:`query_length` instead.""" def __get__(self): return self.query_length def __set__(self, v): self.query_length = v property query: - """deprecated, query_alignment_sequence instead""" + """deprecated, use :attr:`query_alignment_sequence` + instead.""" def __get__(self): return self.query_alignment_sequence def __set__(self, 
v): self.query_alignment_sequence = v property qqual: - """deprecated, query_alignment_qualities instead""" + """deprecated, use :attr:`query_alignment_qualities` + instead.""" def __get__(self): return array_to_qualitystring(self.query_alignment_qualities) def __set__(self, v): self.query_alignment_qualities = qualitystring_to_array(v) property qstart: - """deprecated, use query_alignment_start instead""" + """deprecated, use :attr:`query_alignment_start` instead.""" def __get__(self): return self.query_alignment_start def __set__(self, v): self.query_alignment_start = v property qend: - """deprecated, use query_alignment_end instead""" + """deprecated, use :attr:`query_alignment_end` instead.""" def __get__(self): return self.query_alignment_end def __set__(self, v): self.query_alignment_end = v property qlen: - """deprecated, use query_alignment_length instead""" + """deprecated, use :attr:`query_alignment_length` + instead.""" def __get__(self): return self.query_alignment_length def __set__(self, v): self.query_alignment_length = v property mrnm: - """deprecated, use next_reference_id instead""" + """deprecated, use :attr:`next_reference_id` instead.""" def __get__(self): return self.next_reference_id def __set__(self, v): self.next_reference_id = v property mpos: - """deprecated, use next_reference_start instead""" + """deprecated, use :attr:`next_reference_start` + instead.""" def __get__(self): return self.next_reference_start def __set__(self, v): self.next_reference_start = v property rname: - """deprecated, use reference_id instead""" + """deprecated, use :attr:`reference_id` instead.""" def __get__(self): return self.reference_id def __set__(self, v): self.reference_id = v property isize: - """deprecated, use template_length instead""" + """deprecated, use :attr:`template_length` instead.""" def __get__(self): return self.template_length def __set__(self, v): self.template_length = v property blocks: - """deprecated, use get_blocks() instead""" + 
"""deprecated, use :meth:`get_blocks()` instead.""" def __get__(self): return self.get_blocks() property aligned_pairs: - """deprecated, use get_aligned_pairs() instead""" + """deprecated, use :meth:`get_aligned_pairs()` instead.""" def __get__(self): return self.get_aligned_pairs() property inferred_length: - """deprecated, use infer_query_length() instead""" + """deprecated, use :meth:`infer_query_length()` instead.""" def __get__(self): return self.infer_query_length() property positions: - """deprecated, use get_reference_positions() instead""" + """deprecated, use :meth:`get_reference_positions()` instead.""" def __get__(self): return self.get_reference_positions() property tags: - """deprecated, use get_tags() instead""" + """deprecated, use :meth:`get_tags()` instead.""" def __get__(self): return self.get_tags() def __set__(self, tags): self.set_tags(tags) def overlap(self): - """deprecated, use get_overlap() instead""" + """deprecated, use :meth:`get_overlap()` instead.""" return self.get_overlap() def opt(self, tag): - """deprecated, use get_tag() instead""" + """deprecated, use :meth:`get_tag()` instead.""" return self.get_tag(tag) def setTag(self, tag, value, value_type=None, replace=True): - """deprecated, use set_tag() instead""" + """deprecated, use :meth:`set_tag()` instead.""" return self.set_tag(tag, value, value_type, replace) @@ -2864,21 +2968,21 @@ cdef class PileupColumn: # Functions, properties for compatibility with pysam < 0.8 ######################################################## property pos: - """deprecated: use reference_pos""" + """deprecated, use :attr:`reference_pos` instead.""" def __get__(self): return self.reference_pos def __set__(self, v): self.reference_pos = v property tid: - """deprecated: use reference_id""" + """deprecated, use :attr:`reference_id` instead.""" def __get__(self): return self.reference_id def __set__(self, v): self.reference_id = v property n: - """deprecated: use nsegments""" + """deprecated, use 
:attr:`nsegments` instead.""" def __get__(self): return self.nsegments def __set__(self, v): @@ -2962,7 +3066,7 @@ cdef class PileupColumn: Returns ------- - list: a list of bases/sequences per read at pileup column position. + a list of bases/sequences per read at pileup column position. : list """ cdef uint32_t x = 0 @@ -3051,7 +3155,7 @@ cdef class PileupColumn: Returns ------- - list: a list of quality scores + a list of quality scores : list """ cdef uint32_t x = 0 cdef const bam_pileup1_t * p = NULL @@ -3079,7 +3183,7 @@ cdef class PileupColumn: Returns ------- - list: a list of quality scores + a list of quality scores : list """ if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") @@ -3105,7 +3209,7 @@ cdef class PileupColumn: Returns ------- - list: a list of read positions + a list of read positions : list """ if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") @@ -3131,7 +3235,7 @@ cdef class PileupColumn: Returns ------- - list: a list of query names at pileup column position. + a list of query names at pileup column position. : list """ if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") @@ -3177,7 +3281,7 @@ cdef class PileupRead: property query_position: """position of the read base at the pileup site, 0-based. - None if is_del or is_refskip is set. + None if :attr:`is_del` or :attr:`is_refskip` is set. 
""" def __get__(self): diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi new file mode 100644 index 0000000..23631db --- /dev/null +++ b/pysam/libcalignmentfile.pyi @@ -0,0 +1,237 @@ +import array +import sys +from typing import ( + Any, + Dict, + Type, + NamedTuple, + Tuple, + Optional, + Sequence, + Union, + Callable, + List, + Iterable, +) + +if sys.version_info < (3, 8): + from typing_extensions import Literal +else: + from typing import Literal + +from pysam.libchtslib import HTSFile, _HasFileNo +from pysam.libcalignedsegment import AlignedSegment +from pysam.libcfaidx import FastaFile + +class IndexStats(NamedTuple): + contig: str + mapped: int + unmapped: int + total: int + +VALID_HEADER_TYPES: Dict[str, Type] +VALID_HEADERS: Tuple[str] +KNOWN_HEADER_FIELDS: Dict[str, Dict[str, Type]] +VALID_HEADER_ORDER: Dict[str, Tuple[str]] + +def build_header_line(fields: Dict[str, str], record: str) -> str: ... + +class AlignmentHeader: + def __init__(self) -> None: ... + @classmethod + def _from_text_and_lengths( + cls, + text: Optional[str], + reference_names: Optional[Sequence[str]], + reference_lengths: Optional[Sequence[int]], + ) -> AlignmentHeader: ... + @classmethod + def from_text(cls, text: str) -> AlignmentHeader: ... + @classmethod + def from_dict(cls, header_dict: Dict) -> AlignmentHeader: ... + @classmethod + def from_references( + cls, + reference_names: Sequence[str], + reference_lengths: Sequence[int], + text: Optional[str] = ..., + add_sq_text: bool = ..., + ) -> AlignmentHeader: ... + def __bool__(self) -> bool: ... + def copy(self) -> AlignmentHeader: ... + @property + def nreferences(self) -> int: ... + @property + def references(self) -> Tuple[str]: ... + @property + def lengths(self) -> Tuple[int]: ... + def to_dict(self) -> Dict: ... + def get_reference_name(self, tid: int) -> Optional[str]: ... + def get_reference_length(self, reference: int) -> int: ... + def is_valid_tid(self, tid: int) -> bool: ... 
+ def get_tid(self, reference: int) -> int: ... + +class AlignmentFile(HTSFile): + def __init__( + self, + filename: Union[str, bytes, int, _HasFileNo], + mode: Optional[ + Literal["r", "w", "wh", "rb", "wb", "wbu", "wb0", "rc", "wc"] + ] = ..., + template: Optional[AlignmentFile] = ..., + reference_names: Optional[Sequence[str]] = ..., + reference_lengths: Optional[Sequence[int]] = ..., + reference_filename: Optional[str] = ..., + text: Optional[str] = ..., + header: Union[None, Dict, AlignmentHeader] = ..., + add_sq_text: bool = ..., + add_sam_header: bool = ..., + check_sq: bool = ..., + index_filename: Optional[str] = ..., + filepath_index: Optional[str] = ..., + require_index: bool = ..., + duplicate_filehandle: bool = ..., + ignore_truncation: bool = ..., + format_options: Optional[Sequence[str]] = ..., + threads: int = ..., + ) -> None: ... + def has_index(self) -> bool: ... + def check_index(self) -> bool: ... + def fetch( + self, + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + tid: Optional[int] = ..., + until_eof: bool = ..., + multiple_iterators: bool = ..., + reference: Optional[str] = ..., + end: int = ..., + ) -> IteratorRow: ... + def head(self, n: int, multiple_iterators: bool = ...) -> IteratorRow: ... + def mate(self, read: AlignedSegment) -> AlignedSegment: ... + def pileup( + self, + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + reference: Optional[str] = ..., + end: Optional[int] = ..., + truncate: bool = ..., + max_depth: int = ..., + stepper: str = ..., + fastafile: Optional[FastaFile] = ..., + ignore_overlaps: bool = ..., + flag_filter: int = ..., + flag_require: int = ..., + ignore_orphans: bool = ..., + min_base_quality: int = ..., + adjust_capq_threshold: int = ..., + min_mapping_quality: int = ..., + compute_baq: bool = ..., + redo_baq: bool = ..., + ) -> IteratorColumn: ... 
+ def count( + self, + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + until_eof: bool = ..., + read_callback: Union[str, Callable[[AlignedSegment], bool]] = ..., + reference: Optional[str] = ..., + end: Optional[int] = ..., + ) -> int: ... + def count_coverage( + self, + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + quality_threshold: int = ..., + read_callback: Union[str, Callable[[AlignedSegment], bool]] = ..., + reference: Optional[str] = ..., + end: Optional[int] = ..., + ) -> Tuple[array.array, array.array, array.array, array.array]: ... + def find_introns_slow( + self, read_iterator: Iterable[AlignedSegment] + ) -> Dict[Tuple[int, int], int]: ... + def find_introns( + self, read_iterator: Iterable[AlignedSegment] + ) -> Dict[Tuple[int, int], int]: ... + def close(self) -> None: ... + def write(self, read: AlignedSegment) -> int: ... + def __enter__(self) -> AlignmentFile: ... + def __exit__(self, exc_type, exc_value, traceback): ... + @property + def mapped(self) -> int: ... + @property + def unmapped(self) -> int: ... + @property + def nocoordinate(self) -> int: ... + def get_index_statistics(self) -> List[IndexStats]: ... + def __iter__(self) -> Any: ... + def __next__(self) -> Any: ... + def is_valid_tid(self, tid: int) -> bool: ... + def get_tid(self, reference: str) -> int: ... + def get_reference_name(self, tid: int) -> str: ... + def get_reference_length(self, reference: str) -> int: ... + @property + def nreferences(self) -> int: ... + @property + def references(self) -> Tuple[str, ...]: ... + @property + def lengths(self) -> Tuple[int, ...]: ... + @property + def reference_filename(self) -> Optional[str]: ... + @property + def header(self) -> AlignmentHeader: ... + +class IteratorRow: + def __iter__(self) -> IteratorRow: ... + def __next__(self) -> AlignedSegment: ... 
+ +class IteratorRowAll(IteratorRow): ... +class IteratorRowAllRefs(IteratorRow): ... +class IteratorRowHead(IteratorRow): ... +class IteratorRowRegion(IteratorRow): ... +class IteratorRowSelection(IteratorRow): ... + +class IteratorColumn: + def __iter__(self) -> IteratorRow: ... + def __next__(self) -> AlignedSegment: ... + @property + def seq_len(self) -> int: ... + def add_reference(self, fastafile: FastaFile) -> None: ... + def has_reference(self) -> bool: ... + +class IteratorColumnAll(IteratorColumn): ... +class IteratorColumnAllRefs(IteratorColumn): ... +class IteratorColumnRegion(IteratorColumn): ... + +class SNPCall: + @property + def tid(self) -> int: ... + @property + def pos(self) -> int: ... + @property + def reference_base(self) -> str: ... + @property + def genotype(self) -> str: ... + @property + def consensus_quality(self) -> int: ... + @property + def snp_quality(self) -> int: ... + @property + def mapping_quality(self) -> int: ... + @property + def coverage(self) -> int: ... + +class IndexedReads: + def __init__( + self, samfile: AlignmentFile, multiple_iterators: bool = ... + ) -> None: ... + def build(self) -> None: ... + def find(self, query_name: str) -> IteratorRow: ... diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index d4bdfba..799258a 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -494,7 +494,7 @@ cdef class AlignmentHeader(object): return result def as_dict(self): - """deprecated: use :meth:`to_dict()`""" + """deprecated, use :meth:`to_dict()` instead""" return self.to_dict() def get_reference_name(self, tid): @@ -605,7 +605,7 @@ cdef class AlignmentFile(HTSFile): be constituted from several sources (see also the samtools format specification): - 1. If `template` is given, the header is copied from a another + 1. If `template` is given, the header is copied from another `AlignmentFile` (`template` must be a :class:`~pysam.AlignmentFile`). 
@@ -1073,7 +1073,7 @@ cdef class AlignmentFile(HTSFile): Returns ------- - An iterator over a collection of reads. + An iterator over a collection of reads. : IteratorRow Raises ------ @@ -1145,14 +1145,14 @@ cdef class AlignmentFile(HTSFile): Returns ------- - an iterator over a collection of reads + an iterator over a collection of reads : IteratorRowHead ''' return IteratorRowHead(self, n, multiple_iterators=multiple_iterators) def mate(self, AlignedSegment read): - '''return the mate of :class:`~pysam.AlignedSegment` `read`. + '''return the mate of :class:`pysam.AlignedSegment` `read`. .. note:: @@ -1169,7 +1169,7 @@ cdef class AlignmentFile(HTSFile): Returns ------- - :class:`~pysam.AlignedSegment` : the mate + the mate : AlignedSegment Raises ------ @@ -1266,7 +1266,7 @@ cdef class AlignmentFile(HTSFile): uses every single read turning off any filtering. ``samtools`` - same filter and read processing as in :term:`csamtools` + same filter and read processing as in samtools pileup. For full compatibility, this requires a 'fastafile' to be given. The following options all pertain to filtering of the ``samtools`` stepper. @@ -1323,7 +1323,7 @@ cdef class AlignmentFile(HTSFile): Returns ------- - an iterator over genomic positions. + an iterator over genomic positions. : IteratorColumn """ cdef int rtid, has_coord @@ -1364,7 +1364,7 @@ cdef class AlignmentFile(HTSFile): The region is specified by :term:`contig`, `start` and `stop`. :term:`reference` and `end` are also accepted for backward compatibility as synonyms for :term:`contig` and `stop`, - respectively. Alternatively, a :term:`samtools` :term:`region` + respectively. Alternatively, a `samtools`_ :term:`region` string can be supplied. A :term:`SAM` file does not allow random access and if @@ -1468,7 +1468,7 @@ cdef class AlignmentFile(HTSFile): The region is specified by :term:`contig`, `start` and `stop`. 
:term:`reference` and `end` are also accepted for backward compatibility as synonyms for :term:`contig` and `stop`, - respectively. Alternatively, a :term:`samtools` :term:`region` + respectively. Alternatively, a `samtools`_ :term:`region` string can be supplied. The coverage is computed per-base [ACGT]. Parameters @@ -1946,7 +1946,7 @@ cdef class AlignmentFile(HTSFile): # Compatibility functions for pysam < 0.14 property text: - """deprecated, use .header directly""" + """deprecated, use :attr:`references` and :attr:`lengths` instead""" def __get__(self): if self.header: return self.header.__str__() @@ -1955,11 +1955,11 @@ cdef class AlignmentFile(HTSFile): # Compatibility functions for pysam < 0.8.3 def gettid(self, reference): - """deprecated, use get_tid() instead""" + """deprecated, use :meth:`get_tid` instead""" return self.get_tid(reference) def getrname(self, tid): - """deprecated, use get_reference_name() instead""" + """deprecated, use :meth:`get_reference_name` instead""" return self.get_reference_name(tid) @@ -2885,7 +2885,7 @@ cdef class IndexedReads: The index is kept in memory and can be substantial. - By default, the file is re-openend to avoid conflicts if multiple + By default, the file is re-opened to avoid conflicts if multiple operators work on the same file. Set `multiple_iterators` = False to not re-open `samfile`. 
diff --git a/pysam/libcbcf.pxd b/pysam/libcbcf.pxd index 1d4129b..6508994 100644 --- a/pysam/libcbcf.pxd +++ b/pysam/libcbcf.pxd @@ -38,6 +38,8 @@ from pysam.libchtslib cimport * cdef class VariantHeader(object): cdef bcf_hdr_t *ptr + cdef _add_sample(self, name) + cdef _hdr_sync(self) cdef _subset_samples(self, include_samples) diff --git a/pysam/libcbcf.pyi b/pysam/libcbcf.pyi new file mode 100644 index 0000000..f896cca --- /dev/null +++ b/pysam/libcbcf.pyi @@ -0,0 +1,369 @@ +import sys +from typing import ( + Optional, + Union, + Any, + Sequence, + Tuple, + Iterator, + List, + Iterable, + Dict, + overload, + TypeVar, + Mapping, + Generic, +) + +if sys.version_info < (3, 8): + from typing_extensions import Literal +else: + from typing import Literal + +from .libchtslib import HTSFile, _HasFileNo + +_D = TypeVar("_D") +_K = TypeVar("_K", str, Union[int, str]) +_V = TypeVar("_V") + +class _Mapping(Generic[_K, _V]): + def __len__(self) -> int: ... + def __contains__(self, key: _K) -> bool: ... + def __iter__(self) -> Iterator[_K]: ... + def iterkeys(self) -> Iterator[_K]: ... + def itervalues(self) -> Iterator[_V]: ... + def iteritems(self) -> Iterator[Tuple[_K, _V]]: ... + def keys(self) -> List[_K]: ... + def items(self) -> List[Tuple[_K, _V]]: ... + def values(self) -> List[_V]: ... + def __bool__(self) -> bool: ... + def __getitem__(self, key: _K) -> _V: ... + def get(self, key: _K, default: _D = ...) -> Union[_D, _V]: ... + +class VariantHeaderRecord(_Mapping[str, str]): + @property + def header(self) -> VariantHeader: ... + @property + def type(self) -> Optional[str]: ... + @property + def key(self) -> Optional[str]: ... + @property + def value(self) -> Optional[str]: ... + @property + def attrs(self) -> Sequence[Tuple[str, str]]: ... + def update(self, items: Union[Iterable, Dict] = ..., **kwargs) -> None: ... + def pop(self, key: str, default: str = ...) -> str: ... + def remove(self) -> None: ... 
# crashes + +class VariantHeaderRecords: + @property + def header(self) -> VariantHeader: ... + def __len__(self) -> int: ... + def __bool__(self) -> bool: ... + def __getitem__(self, index) -> VariantHeaderRecord: ... + def __iter__(self) -> Iterator[VariantHeaderRecord]: ... + +class VariantMetadata: + @property + def header(self) -> VariantHeader: ... + @property + def name(self) -> str: ... + # @property # should this be exposed? + # def id(self) -> int: ... + @property + def number(self) -> Optional[str]: ... + @property + def type(self) -> Optional[str]: ... + @property + def description(self) -> Optional[str]: ... + @property + def record(self) -> Optional[VariantHeaderRecord]: ... + def remove_header(self) -> None: ... + +class VariantHeaderMetadata(_Mapping[str, VariantMetadata]): + @property + def header(self) -> VariantHeader: ... + def add( + self, + id: str, + number: Optional[str], + type: Optional[str], + description: str, + **kwargs + ) -> None: ... + def remove_header(self, key: str) -> None: ... + def clear_header(self) -> None: ... + +class VariantContig: + @property + def header(self) -> VariantHeader: ... + @property + def name(self) -> str: ... + @property + def id(self) -> int: ... + @property + def length(self) -> Optional[int]: ... + @property + def header_record(self) -> VariantHeaderRecord: ... + def remove_header(self) -> None: ... + +class VariantHeaderContigs(_Mapping[Union[int, str], VariantContig]): + @property + def header(self) -> VariantHeader: ... + def remove_header(self, key: Union[int, str]) -> None: ... + def clear_header(self) -> None: ... + def add(self, id: str, length: Optional[int] = ..., **kwargs) -> None: ... + +class VariantHeaderSamples: + @property + def header(self) -> VariantHeader: ... + def __len__(self) -> int: ... + def __bool__(self) -> bool: ... + def __getitem__(self, index: int) -> str: ... + def __iter__(self) -> Iterator[str]: ... + def __contains__(self, key: str) -> bool: ... 
+ def add(self, name: str) -> None: ... + +class VariantHeader: + def __init__(self) -> None: ... + def __bool__(self) -> bool: ... + def copy(self) -> VariantHeader: ... + def merge(self, header: VariantHeader) -> None: ... + @property + def version(self) -> str: ... + @property + def samples(self) -> VariantHeaderSamples: ... + @property + def records(self) -> VariantHeaderRecords: ... + @property + def contigs(self) -> VariantHeaderContigs: ... + @property + def filters(self) -> VariantHeaderMetadata: ... + @property + def info(self) -> VariantHeaderMetadata: ... + @property + def formats(self) -> VariantHeaderMetadata: ... + @property + def alts(self) -> Dict[str, VariantHeaderRecord]: ... + def new_record( + self, + contig: Optional[str] = ..., + start: int = ..., + stop: int = ..., + alleles: Optional[Tuple[str]] = ..., + id: Optional[str] = ..., + qual: Optional[int] = ..., + filter: Optional[Any] = ..., + info: Optional[Mapping[str, _InfoValue]] = ..., + samples: Optional[Iterable[str]] = ..., + **kwargs + ) -> VariantRecord: ... + def add_record(self, record: VariantHeaderRecord) -> None: ... + def add_line(self, line: str) -> None: ... + @overload + def add_meta( + self, key: str, value: None = ..., items: Iterable[Tuple[str, str]] = ... + ) -> None: ... + @overload + def add_meta(self, key: str, value: str = ..., items: None = ...) -> None: ... + def add_sample(self, name: str) -> None: ... + def add_samples(self, *args: Union[str, Iterable[str]]) -> None: ... + +class VariantRecordFilter(_Mapping[Union[int, str], VariantMetadata]): + def add(self, key: str) -> None: ... + def __delitem__(self, key: Union[int, str]) -> None: ... + def clear(self) -> None: ... + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + +class VariantRecordFormat(_Mapping[str, VariantMetadata]): + def __delitem__(self, key: str) -> None: ... + def clear(self) -> None: ... 
+ +_InfoValue = Any # TODO see bcf_info_get_value + +class VariantRecordInfo(_Mapping[str, _InfoValue]): + def __setitem__(self, key: str, object: _InfoValue) -> None: ... + def __delitem__(self, key: str) -> None: ... + def clear(self) -> None: ... + def update( + self, items: Optional[_Mapping[str, _InfoValue]] = ..., **kwargs + ) -> None: ... + def pop(self, key: str, default: _D = ...) -> Union[_D, _InfoValue]: ... + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + +class VariantRecordSamples(_Mapping[Union[str, int], "VariantRecordSample"]): + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + # TODO Do these work? Isn’t the container read only? + def update( + self, + items: Optional[Mapping[Union[str, int], VariantRecordSample]] = ..., + **kwargs + ) -> None: ... + def pop( + self, key: Union[str, int], default: _D = ... + ) -> Union[_D, VariantRecordSample]: ... + +class VariantRecord: + @property + def header(self) -> VariantHeader: ... + def copy(self) -> VariantRecord: ... + def translate(self, dst_header: VariantHeader) -> None: ... + rid: int + chrom: str + contig: str + pos: int + start: int + stop: int + rlen: int + qual: Optional[int] + id: Optional[str] + ref: Optional[str] + alleles: Optional[Tuple[str]] + alts: Optional[Tuple[str]] + @property + def filter(self) -> VariantRecordFilter: ... + @property + def info(self) -> VariantRecordInfo: ... + @property + def format(self) -> VariantRecordFormat: ... + @property + def samples(self) -> VariantRecordSamples: ... + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + +_FormatValue = Any # TODO see bcf_format_get_value + +class VariantRecordSample(_Mapping[str, _FormatValue]): + @property + def index(self) -> int: ... + @property + def name(self) -> str: ... 
+ allele_indices: Optional[Tuple[Optional[int]]] + alleles: Optional[Tuple[Optional[str]]] + phased: bool + def __setitem__(self, key: str, value: _FormatValue) -> None: ... + def __delitem__(self, key: str) -> None: ... + def clear(self) -> None: ... + def update( + self, items: Optional[Mapping[str, _FormatValue]] = ..., **kwargs + ) -> None: ... + def pop(self, key: str, default: _D = ...) -> Union[_D, _FormatValue]: ... + def __eq__(self, other) -> Any: ... + def __ne__(self, other) -> Any: ... + +class BaseIndex(_Mapping[Union[int, str], str]): + refs: Sequence[str] + refmap: Dict[str, str] + def __init__(self) -> None: ... + # TODO Do these work? Isn’t the container read only? + def update(self, items: Optional[Mapping[str, str]] = ..., **kwargs) -> None: ... + def pop(self, key: str, default: _D = ...) -> Union[_D, str]: ... + +class BCFIndex(BaseIndex): + @property + def header(self) -> VariantHeader: ... + def __init__(self) -> None: ... + def fetch( + self, + bcf: VariantFile, + contig: str, + start: Optional[int], + stop: Optional[int], + reopen: bool, + ) -> BCFIterator: ... + +class TabixIndex(BaseIndex): + def __init__(self) -> None: ... + def fetch( + self, + bcf: VariantFile, + contig: str, + start: Optional[int], + stop: Optional[int], + reopen: bool, + ) -> TabixIterator: ... + +class BaseIterator: + def __init__(self) -> None: ... + +class BCFIterator(BaseIterator): + def __init__( + self, + bcf: VariantFile, + contig: str, + start: Optional[int] = ..., + stop: Optional[int] = ..., + reopen: bool = ..., + ) -> None: ... + def __iter__(self) -> BCFIterator: ... + def __next__(self) -> VariantRecord: ... + +class TabixIterator(BaseIterator): + def __init__( + self, + bcf: VariantFile, + contig: str, + start: Optional[int] = ..., + stop: Optional[int] = ..., + reopen: bool = ..., + ) -> None: ... + def __iter__(self) -> TabixIterator: ... + def __next__(self) -> VariantRecord: ... 
+ +class VariantFile(HTSFile): + @property + def header(self) -> VariantHeader: ... + @property + def index(self) -> BaseIndex: ... + @property + def drop_samples(self) -> bool: ... + @property + def is_reading(self) -> bool: ... + @property + def header_written(self) -> bool: ... + def __init__( + self, + filename: Union[str, bytes, int, _HasFileNo], + mode: Optional[Literal["r", "w", "wh", "rb", "wb", "wbu", "wb0"]] = ..., + index_filename: Optional[str] = ..., + header: Optional[VariantHeader] = ..., + drop_samples: bool = ..., + duplicate_filehandle: bool = ..., + ignore_truncation: bool = ..., + threads: int = ..., + ) -> None: ... + def close(self) -> None: ... + def __iter__(self) -> VariantFile: ... + def __next__(self) -> VariantRecord: ... + def copy(self) -> VariantFile: ... + def open( + self, + filename: Union[str, bytes, int, _HasFileNo], + mode: Optional[Literal["r", "w", "wh", "rb", "wb", "wbu", "wb0"]] = ..., + index_filename: Optional[str] = ..., + header: Optional[VariantHeader] = ..., + drop_samples: bool = ..., + duplicate_filehandle: bool = ..., + ignore_truncation: bool = ..., + threads: int = ..., + ) -> None: ... + def reset(self) -> None: ... + def is_valid_tid(self, tid: int) -> bool: ... + def get_tid(self, reference: str) -> int: ... + def get_reference_name(self, tid: int) -> str: ... + def fetch( + self, + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + reopen: bool = ..., + end: Optional[int] = ..., + reference: Optional[str] = ..., + ) -> Iterator[VariantRecord]: ... + def new_record(self, *args, **kwargs) -> Any: ... + def write(self, record: VariantRecord) -> int: ... + def subset_samples(self, include_samples: Iterable[str]) -> None: ... 
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index 05a5fe8..fbb3a3d 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -2134,8 +2134,7 @@ cdef class VariantHeader(object): bcf_hdr_add_hrec(self.ptr, hrec) - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) + self._hdr_sync() def add_line(self, line): """Add a metadata line to this header""" @@ -2143,8 +2142,7 @@ cdef class VariantHeader(object): if bcf_hdr_append(self.ptr, bline) < 0: raise ValueError('invalid header line') - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) + self._hdr_sync() def add_meta(self, key, value=None, items=None): @@ -2176,16 +2174,37 @@ cdef class VariantHeader(object): bcf_hdr_add_hrec(self.ptr, hrec) - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) + self._hdr_sync() - def add_sample(self, name): - """Add a new sample to this header""" + cdef _add_sample(self, name): bname = force_bytes(name) if bcf_hdr_add_sample(self.ptr, bname) < 0: raise ValueError('Duplicated sample name: {}'.format(name)) - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) + + cdef _hdr_sync(self): + cdef bcf_hdr_t *hdr = self.ptr + if hdr.dirty: + if bcf_hdr_sync(hdr) < 0: + raise MemoryError('unable to reallocate VariantHeader') + + def add_sample(self, name): + """Add a new sample to this header""" + self._add_sample(name) + self._hdr_sync() + + def add_samples(self, *args): + """Add several new samples to this header. + This function takes multiple arguments, each of which may + be either a sample name or an iterable returning sample names + (e.g., a list of sample names). 
+ """ + for arg in args: + if isinstance(arg, str): + self._add_sample(arg) + else: + for name in arg: + self._add_sample(name) + self._hdr_sync() cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): @@ -3242,6 +3261,7 @@ cdef class VariantRecord(object): self.ptr.rlen = rlen else: self.ptr.rlen = len(values[0]) + r.d.var_type = -1 bcf_sync_end(self) @property @@ -3270,6 +3290,7 @@ cdef class VariantRecord(object): raise ValueError('cannot set null alt allele') ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] self.alleles = ref + value + r.d.var_type = -1 @property def filter(self): @@ -3299,6 +3320,34 @@ cdef class VariantRecord(object): raise ValueError('Error unpacking VariantRecord') return makeVariantRecordSamples(self) + property alleles_variant_types: + def __get__(self): + cdef bcf1_t *r = self.ptr + cdef tuple result = PyTuple_New(r.n_allele) + + for i in range(r.n_allele): + tp = bcf_get_variant_type(r, i) + + if tp == VCF_REF: + v_type = "REF" + elif tp == VCF_SNP: + v_type = "SNP" + elif tp == VCF_MNP: + v_type = "MNP" + elif tp == VCF_INDEL: + v_type = "INDEL" + elif tp == VCF_BND: + v_type = "BND" + elif tp == VCF_OVERLAP: + v_type = "OVERLAP" + else: + v_type = "OTHER" + + PyTuple_SET_ITEM(result, i, v_type) + Py_INCREF(v_type) + + return result + def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op): if op != 2 and op != 3: return NotImplemented diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd index d57f784..f8892ed 100644 --- a/pysam/libcbcftools.pxd +++ b/pysam/libcbcftools.pxd @@ -6,4 +6,3 @@ cdef extern from "bcftools.pysam.h": void bcftools_set_stdout(int fd) void bcftools_set_stdout_fn(const char *) void bcftools_close_stdout() - void bcftools_set_optind(int) diff --git a/pysam/libcbcftools.pyi b/pysam/libcbcftools.pyi new file mode 100644 index 0000000..242b931 --- /dev/null +++ b/pysam/libcbcftools.pyi @@ -0,0 +1 @@ +def py_bcftools() -> None: ... 
diff --git a/pysam/libcbgzf.pyi b/pysam/libcbgzf.pyi new file mode 100644 index 0000000..4d64e8d --- /dev/null +++ b/pysam/libcbgzf.pyi @@ -0,0 +1,40 @@ +import sys + +from typing import Optional, Union, Any, NoReturn + +if sys.version_info < (3, 8): + from typing_extensions import Literal +else: + from typing import Literal + +BUFFER_SIZE: int + +class BGZFile: + def __init__( + self, + filename: str, + mode: Optional[Literal["r", "rb", "a", "ab", "w", "wb", "x", "xb"]], + index: Optional[str], + ) -> None: ... + @property + def name(self) -> str: ... + @property + def index(self) -> Optional[str]: ... + def write(self, data: Union[bytes, bytearray, memoryview]) -> int: ... + def read(self, size: int = ...) -> bytes: ... + @property + def closed(self) -> bool: ... + def close(self) -> None: ... + def __enter__(self) -> BGZFile: ... + def __exit__(self, type, value, traceback) -> Any: ... + def flush(self) -> None: ... + def fileno(self) -> NoReturn: ... + def rewind(self) -> None: ... + def readable(self) -> bool: ... + def writable(self) -> bool: ... + def seekable(self) -> bool: ... + def tell(self) -> int: ... + def seek(self, offset: int, whence: int = ...) -> int: ... + def readline(self, size: int = ...) -> bytes: ... + def __iter__(self) -> BGZFile: ... + def __next__(self) -> bytes: ... 
diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx index 02ff2a2..ede6463 100644 --- a/pysam/libcbgzf.pyx +++ b/pysam/libcbgzf.pyx @@ -115,7 +115,7 @@ cdef class BGZFile(object): if read_size < 0: raise IOError('Error reading from BGZFile') elif read_size < size: - chunk = chunk[:size] + chunk = chunk[:read_size] return chunk else: return b'' diff --git a/pysam/libcfaidx.pyi b/pysam/libcfaidx.pyi new file mode 100644 index 0000000..5865701 --- /dev/null +++ b/pysam/libcfaidx.pyi @@ -0,0 +1,68 @@ +import array +from typing import Optional, Any, Sequence, Iterator + +class FastaFile: + def __init__( + self, + filename: str, + filepath_index: Optional[str] = ..., + filepath_index_compressed: Optional[str] = ..., + ) -> None: ... + def is_open(self) -> bool: ... + def __len__(self) -> int: ... + def close(self) -> None: ... + def __enter__(self) -> FastaFile: ... + def __exit__(self, type, value, traceback) -> Any: ... + @property + def closed(self) -> bool: ... + @property + def filename(self) -> str: ... + @property + def references(self) -> Sequence[str]: ... + @property + def nreferences(self) -> Optional[int]: ... + @property + def lengths(self) -> Sequence[int]: ... + def fetch( + self, + reference: Optional[str] = ..., + start: Optional[int] = ..., + end: Optional[int] = ..., + region: Optional[str] = ..., + ) -> str: ... + def get_reference_length(self, reference: str) -> int: ... + def __getitem__(self, reference: str) -> str: ... + def __contains__(self, reference: str) -> bool: ... + +class FastxRecord: + comment: str = ... + quality: str = ... + sequence: str = ... + name: str = ... + def __init__( + self, + name: Optional[str] = ..., + comment: Optional[str] = ..., + sequence: Optional[str] = ..., + quality: Optional[str] = ..., + ) -> None: ... + def set_name(self, name: str) -> None: ... + def set_comment(self, comment: str) -> None: ... + def set_sequence(self, sequence: str, quality: Optional[str] = ...) -> None: ... 
+ def get_quality_array(self, offset: int = ...) -> array.array: ... + +class FastxFile: + def __init__(self, filename: str, persist: bool = ...) -> None: ... + def is_open(self) -> bool: ... + def close(self) -> None: ... + def __enter__(self) -> FastxFile: ... + def __exit__(self, type, value, traceback) -> Any: ... + @property + def closed(self) -> bool: ... + @property + def filename(self) -> str: ... + def __iter__(self) -> Iterator[FastxRecord]: ... + def __next__(self) -> FastxRecord: ... + +# deprecated +class FastqFile(FastxFile): ... diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 9684ef9..ed3ca92 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -1237,6 +1237,85 @@ cdef extern from "htslib/sam.h" nogil: # Added by AH # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" + + + + # // --------------------------- + # // Base modification retrieval + + # /*! @typedef + # @abstract Holds a single base modification. + # @field modified_base The short base code (m, h, etc) or -ChEBI (negative) + # @field canonical_base The canonical base referred to in the MM tag. + # One of A, C, G, T or N. Note this may not be the + # explicit base recorded in the SEQ column (esp. if N). + # @field strand 0 or 1, indicating + or - strand from MM tag. + # @field qual Quality code (256*probability), or -1 if unknown + + # @discussion + # Note this doesn't hold any location data or information on which other + # modifications may be possible at this site. + ctypedef struct hts_base_mod: + int modified_base + int canonical_base + int strand + int qual + + # /// Allocates an hts_base_mode_state. + # /** + # * @return An hts_base_mode_state pointer on success, + # * NULL on failure. + # * + # * This just allocates the memory. The initialisation of the contents is + # * done using bam_parse_basemod. Successive calls may be made to that + # * without the need to free and allocate a new state. 
+ # * + # * The state be destroyed using the hts_base_mode_state_free function. + # */ + ctypedef struct hts_base_mod_state + hts_base_mod_state *hts_base_mod_state_alloc() + + + # /// Destroys an hts_base_mode_state. + # /** + # * @param state The base modification state pointer. + # * + # * The should have previously been created by hts_base_mode_state_alloc. + # */ + void hts_base_mod_state_free(hts_base_mod_state *state) + + # /// Parses the Mm and Ml tags out of a bam record. + # /** + # * @param b BAM alignment record + # * @param state The base modification state pointer. + # * @return 0 on success, + # * -1 on failure. + # * + # * This fills out the contents of the modification state, resetting the + # * iterator location to the first sequence base. + # */ + int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) + + # /// Finds the next location containing base modifications and returns them + # /** + # * @param b BAM alignment record + # * @param state The base modification state pointer. + # * @param mods A supplied array for returning base modifications + # * @param n_mods The size of the mods array + # * @return The number of modifications found on success, + # * 0 if no more modifications are present, + # * -1 on failure. + # * + # * Unlike bam_mods_at_next_pos this skips ahead to the next site + # * with modifications. + # * + # * If more than n_mods modifications are found, the total found is returned. + # * Note this means the caller needs to check whether this is higher than + # * n_mods. 
+ # */ + + int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,hts_base_mod *mods, int n_mods, int *pos) + # *********************************** # * BAQ calculation and realignment * # ***********************************/ @@ -1479,6 +1558,9 @@ cdef extern from "htslib/vcf.h" nogil: uint8_t VCF_MNP uint8_t VCF_INDEL uint8_t VCF_OTHER + uint8_t VCF_BND + uint8_t VCF_OVERLAP + ctypedef struct variant_t: int type, n # variant type and the number of bases affected, negative for deletions diff --git a/pysam/libchtslib.pyi b/pysam/libchtslib.pyi new file mode 100644 index 0000000..925828b --- /dev/null +++ b/pysam/libchtslib.pyi @@ -0,0 +1,115 @@ +import sys +from typing import List, Union, NoReturn, Iterable, Any, Tuple, Optional, TypeVar + +if sys.version_info < (3, 8): + from typing_extensions import Protocol +else: + from typing import Protocol + +class _HasFileNo(Protocol): + def fileno(self) -> int: ... + +def get_verbosity() -> int: ... +def set_verbosity(level: int): ... + +THFile = TypeVar("THFile", bound="HFile") + +class HFile: + def __init__(self, name: Union[int, str], mode: str = ...) -> None: ... + def __enter__(self: THFile) -> THFile: ... + def __exit__(self, type, value, tb): ... + def __iter__(self) -> Any: ... + def __next__(self) -> str: ... + @property + def closed(self) -> bool: ... + @property + def mode(self) -> str: ... + @property + def name(self) -> Union[int, str]: ... + def close(self) -> None: ... + def fileno(self) -> int: ... + def flush(self) -> None: ... + def isatty(self) -> bool: ... + def readable(self) -> bool: ... + def read(self, size: int = ...) -> bytes: ... + def readall(self) -> bytes: ... + def readinto(self, buf: Any) -> bytes: ... + def readline(self, size: int = ...) -> bytes: ... + def readlines(self) -> List[bytes]: ... + def seek(self, offset: int, whence: int = ...) -> int: ... + def seekable(self) -> bool: ... + def tell(self) -> int: ... + def truncate(self, *args) -> NoReturn: ... 
+ def writable(self) -> bool: ... + def write(self, b: bytes) -> int: ... + def writelines(self, lines: Iterable[bytes]) -> None: ... + +THTSFile = TypeVar("THTSFile", bound="HTSFile") + +class HTSFile: + def __enter__(self: THTSFile) -> THTSFile: ... + def __exit__(self, type, value, traceback) -> Any: ... + @property + def filename(self) -> Any: ... + @property + def mode(self) -> str: ... + @property + def threads(self) -> int: ... + @property + def index_filename(self) -> Optional[str]: ... + @property + def is_stream(self) -> bool: ... + @property + def is_remote(self) -> bool: ... + @property + def duplicate_filehandle(self) -> bool: ... + def close(self) -> None: ... + def check_truncation(self, ignore_truncation: bool = ...) -> None: ... + @property + def category(self) -> str: ... + @property + def format(self) -> str: ... + @property + def version(self) -> Tuple[int, int]: ... + @property + def compression(self) -> str: ... + @property + def description(self) -> str: ... + @property + def is_open(self) -> bool: ... + @property + def is_closed(self) -> bool: ... + @property + def closed(self) -> bool: ... + @property + def is_write(self) -> bool: ... + @property + def is_read(self) -> bool: ... + @property + def is_sam(self) -> bool: ... + @property + def is_bam(self) -> bool: ... + @property + def is_cram(self) -> bool: ... + @property + def is_vcf(self) -> bool: ... + @property + def is_bcf(self) -> bool: ... + def reset(self) -> None: ... + def seek(self, offset: int) -> int: ... + def tell(self) -> int: ... + def add_hts_options(self, format_options: Optional[List[str]] = ...) -> None: ... + def parse_region( + self, + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + tid: Optional[int] = ..., + reference: Optional[str] = ..., + end: Optional[int] = ..., + ) -> Tuple[int, int, int, int]: ... + def is_valid_tid(self, tid: int) -> bool: ... 
+ def is_valid_reference_name(self, contig: str) -> bool: ... + def get_tid(self, contig: str) -> int: ... + def get_reference_name(self, tid: int) -> Optional[str]: ... diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 778fc23..3a9bbd2 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -473,7 +473,7 @@ cdef class HTSFile(object): Returns ------- - The file position after moving the file pointer. + The file position after moving the file pointer. : pointer """ return self.seek(self.start_offset) diff --git a/pysam/libcsamfile.pyi b/pysam/libcsamfile.pyi new file mode 100644 index 0000000..e9b5a25 --- /dev/null +++ b/pysam/libcsamfile.pyi @@ -0,0 +1,5 @@ +from pysam.libcalignedsegment import AlignedSegment +from pysam.libcalignmentfile import AlignmentFile + +class AlignedRead(AlignedSegment): ... +class Samfile(AlignmentFile): ... diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd index 3c39476..628d9a5 100644 --- a/pysam/libcsamtools.pxd +++ b/pysam/libcsamtools.pxd @@ -6,4 +6,3 @@ cdef extern from "samtools.pysam.h": void samtools_set_stdout(int fd) void samtools_set_stdout_fn(const char *) void samtools_close_stdout() - void samtools_set_optind(int) diff --git a/pysam/libcsamtools.pyi b/pysam/libcsamtools.pyi new file mode 100644 index 0000000..fe158dd --- /dev/null +++ b/pysam/libcsamtools.pyi @@ -0,0 +1 @@ +def py_samtools() -> None: ... diff --git a/pysam/libctabix.pyi b/pysam/libctabix.pyi new file mode 100644 index 0000000..e1ba211 --- /dev/null +++ b/pysam/libctabix.pyi @@ -0,0 +1,103 @@ +from typing import Optional, Literal, List, Any + +from pysam.libchtslib import HTSFile + +_ParseResult = Any + +class Parser: + def __init__(self, encoding: str = ...) -> None: ... + def get_encoding(self) -> str: ... + def set_encoding(self, encoding: str) -> None: ... + def __call__(self, buffer: str, length: int) -> _ParseResult: ... + +class asTuple(Parser): ... +class asGFF3(Parser): ... +class asGTF(Parser): ... 
+class asBed(Parser): ... +class asVCF(Parser): ... + +class TabixFile(HTSFile): + filename_index: bytes = ... + @property + def header(self) -> List[str]: ... + @property + def contigs(self) -> List[str]: ... + def __init__( + self, + filename: str, + mode: str = ..., + parser: Parser = ..., + index: Optional[str] = ..., + encoding: str = ..., + threads: int = ..., + *args, + **kwargs + ) -> None: ... + def fetch( + self, + reference: Optional[str] = ..., + start: Optional[int] = ..., + end: Optional[int] = ..., + region: Optional[str] = ..., + parser: Optional[Parser] = ..., + multiple_iterators: bool = ..., + ) -> Any: ... + def close(self) -> None: ... + +class TabixIterator: + def __init__(self, encoding: str = ...) -> None: ... + def __iter__(self) -> TabixIterator: ... + def __next__(self) -> str: ... + +class EmptyIterator: + def __iter__(self) -> Any: ... + def __next__(self) -> Any: ... + +class TabixIteratorParsed(TabixIterator): + def __init__(self, parser: Parser) -> None: ... + def __next__(self) -> Any: ... + +class GZIterator: + def __init__( + self, filename: str, bufer_size: int = ..., encoding: str = ... + ) -> None: ... + def __iter__(self) -> GZIterator: ... + def __next__(self) -> str: ... + +class GZIteratorHead(GZIterator): ... + +class GZIteratorParsed(GZIterator): + def __init__(self, parser: Parser) -> None: ... + def __next__(self) -> _ParseResult: ... + +def tabix_compress(filename_in: str, filename_out: str, force: bool = ...) -> None: ... +def tabix_index( + filename: str, + force: bool = ..., + seq_col: Optional[int] = ..., + start_col: Optional[int] = ..., + end_col: Optional[int] = ..., + preset: Optional[Literal["gff", "bed", "sam", "vcf", "psltbl", "pileup"]] = ..., + meta_char: str = ..., + line_skip: int = ..., + zerobased: bool = ..., + min_shift: int = ..., + index: Optional[str] = ..., + keep_original: bool = ..., + csi: bool = ..., +) -> str: ... 
+ +class tabix_file_iterator: + def __init__(self, infile: str, parser: Parser, buffer_size: int = ...) -> None: ... + def __iter__(self) -> tabix_file_iterator: ... + def __next__(self) -> _ParseResult: ... + +class tabix_generic_iterator: + def __init__(self, infile: str, parser: Parser) -> None: ... + def __iter__(self) -> tabix_generic_iterator: ... + def __next__(self) -> _ParseResult: ... + +def tabix_iterator(infile: str, parser: Optional[Parser]) -> _ParseResult: ... + +# backwards compatibility +class Tabixfile(TabixFile): ... diff --git a/pysam/libctabixproxies.pyi b/pysam/libctabixproxies.pyi new file mode 100644 index 0000000..f720c7e --- /dev/null +++ b/pysam/libctabixproxies.pyi @@ -0,0 +1,62 @@ +from typing import ( + Optional, + overload, + List, + Dict, + OrderedDict, + Tuple, + Iterable, + KeysView, + Any, +) + +class TupleProxyIterator: + def __init__(self, proxy: Any): ... + def __iter__(self) -> TupleProxyIterator: ... + def __next__(self) -> Optional[str]: ... + +class TupleProxy: + def __init__(self, encoding: str = ...) -> None: ... + def __copy__(self) -> TupleProxy: ... + def compare(self, other: TupleProxy) -> int: ... + def getMinFields(self) -> int: ... + def getMaxFields(self) -> int: ... + def _getindex(self, index: int) -> str: ... + @overload + def __getitem__(self, key: slice) -> List[str]: ... + @overload + def __getitem__(self, key: int) -> Any: ... + def _setindex(self, index: int, value: Optional[str]) -> None: ... + def __setitem__(self, index: int, value: Optional[str]) -> None: ... + def __len__(self) -> int: ... + def __iter__(self) -> TupleProxyIterator: ... + +class NamedTupleProxy(TupleProxy): + def __setattr__(self, key: str, value: str) -> None: ... + def __getattr__(self, key: str) -> str: ... + +class GTFProxy(NamedTupleProxy): + def to_dict(self) -> Dict[str, Any]: ... + def from_dict(self, d: Dict[str, Any]) -> None: ... + def invert(self, lcontig: int) -> None: ... + def keys(self) -> KeysView[str]: ... 
+ def setAttribute(self, key: str, value: Any) -> None: ... + def attribute_string2dict(self, s: str) -> OrderedDict[str, Any]: ... + def dict2attribute_string(self, d: Dict[str, Any]) -> str: ... + def attribute_string2iterator(self, s: str) -> Iterable[Tuple[str, Any]]: ... + def __getattr__(self, key: str) -> Any: ... + def __setattr__(self, key: str, value: Any) -> None: ... + # deprecated: + # def asDict(self) -> Any: ... + # def as_dict(self) -> Any: ... + # def fromDict(self, *args, **kwargs) -> Any: ... + +class GFF3Proxy(GTFProxy): ... + +class BedProxy(NamedTupleProxy): + def __setattr__(self, key: str, value: Any) -> None: ... + +class VCFProxy(NamedTupleProxy): + @property + def pos(self) -> int: ... + def __setattr__(self, key: str, value: Any) -> None: ... diff --git a/pysam/libcutils.pyi b/pysam/libcutils.pyi new file mode 100644 index 0000000..c82d2a6 --- /dev/null +++ b/pysam/libcutils.pyi @@ -0,0 +1,28 @@ +from array import array +from typing import Iterable, Optional, Tuple, Union + +def get_encoding_error_handler() -> str: ... +def set_encoding_error_handler(name: str) -> str: ... +def _pysam_dispatch( + collection: str, + method: str, + args: Optional[Iterable[str]], + catch_stdout: bool = ..., + is_usage: bool = ..., + save_stdout: Optional[str] = ..., +) -> Tuple[int, Union[bytes, str], Union[bytes, str]]: ... +def parse_region( + contig: Optional[str] = ..., + start: Optional[int] = ..., + stop: Optional[int] = ..., + region: Optional[str] = ..., + reference: Optional[str] = ..., + end: Optional[int] = ..., +) -> Tuple[str, int, int]: ... +def qualitystring_to_array( + input_str: Optional[str], offset: int = ... +) -> Optional[array]: ... +def array_to_qualitystring(qualities: array, offset: int = ...) -> Optional[str]: ... +def qualities_to_qualitystring( + qualities: Union[array, Iterable[str]], offset: int = ... +) -> Optional[str]: ... 
diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index d936dc6..81a19d3 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -19,10 +19,10 @@ from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \ - samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind + samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn from libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \ - bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind + bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn ##################################################################### # hard-coded constants @@ -401,16 +401,6 @@ def _pysam_dispatch(collection, l = len(args[i]) cargs[i + 2] = calloc(l + 1, sizeof(char)) strncpy(cargs[i + 2], args[i], l) - - # reset getopt. 
On OsX there getopt reset is different - # between getopt and getopt_long - if method in [b'index', b'cat', b'quickcheck', - b'faidx', b'kprobaln']: - samtools_set_optind(1) - bcftools_set_optind(1) - else: - samtools_set_optind(0) - bcftools_set_optind(0) # call samtools/bcftools if collection == b"samtools": diff --git a/pysam/libcvcf.pyi b/pysam/libcvcf.pyi new file mode 100644 index 0000000..e69de29 diff --git a/pysam/py.typed b/pysam/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/pysam/samtools.py b/pysam/samtools.py index 30d3edf..a359398 100644 --- a/pysam/samtools.py +++ b/pysam/samtools.py @@ -4,8 +4,10 @@ from pysam.utils import PysamDispatcher SAMTOOLS_DISPATCH = { # samtools 'documented' commands "view": ("view", None), + "head": ("head", None), "sort": ("sort", None), "mpileup": ("mpileup", None), + "consensus": ("consensus", None), "depth": ("depth", None), "faidx": ("faidx", None), "fqidx": ("fqidx", None), diff --git a/pysam/version.h b/pysam/version.h index 4794a2f..5f12054 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.14 (pysam)" -#define BCFTOOLS_VERSION "1.14 (pysam)" -#define HTS_VERSION_TEXT "1.14 (pysam)" +#define SAMTOOLS_VERSION "1.15.1 (pysam)" +#define BCFTOOLS_VERSION "1.15.1 (pysam)" +#define HTS_VERSION_TEXT "1.15.1 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index 97f673a..1251985 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information -__version__ = "0.18.0" +__version__ = "0.19.1" -__samtools_version__ = "1.14" -__bcftools_version__ = "1.14" -__htslib_version__ = "1.14" +__samtools_version__ = "1.15.1" +__bcftools_version__ = "1.15.1" +__htslib_version__ = "1.15.1" diff --git a/pysam/version.pyi b/pysam/version.pyi new file mode 100644 index 0000000..9d52128 --- /dev/null +++ b/pysam/version.pyi @@ -0,0 +1,4 @@ +__version__: str 
+__samtools_version__: str +__bcftools_version__: str +__htslib_version__: str diff --git a/samtools/LICENSE b/samtools/LICENSE index cd102b8..a14e403 100644 --- a/samtools/LICENSE +++ b/samtools/LICENSE @@ -1,6 +1,6 @@ The MIT/Expat License -Copyright (C) 2008-2021 Genome Research Ltd. +Copyright (C) 2008-2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/samtools/README b/samtools/README index 9aceb77..b7b08ae 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.14 # Within the unpacked release directory + cd .../samtools-1.15.1 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.14 # Within the unpacked release directory + cd .../samtools-1.15.1 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.14 # Within the unpacked release directory + cd .../samtools-1.15.1 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. 
In that case you can use: - cd .../samtools-1.14 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.14 + cd .../samtools-1.15.1 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.15.1 make all all-htslib It is possible to override the built-in search path using the HTS_PATH diff --git a/samtools/bam2bcf.c b/samtools/bam2bcf.c deleted file mode 100644 index 7cae49d..0000000 --- a/samtools/bam2bcf.c +++ /dev/null @@ -1,821 +0,0 @@ -/* bam2bcf.c -- variant calling. - - Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2015, 2021 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "bam2bcf.h" - -extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); - -#define CALL_DEFTHETA 0.83 -#define DEF_MAPQ 20 - -#define CAP_DIST 25 - -bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) -{ - bcf_callaux_t *bca; - if (theta <= 0.) theta = CALL_DEFTHETA; - bca = calloc(1, sizeof(bcf_callaux_t)); - bca->capQ = 60; - bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; - bca->min_baseQ = min_baseQ; - bca->e = errmod_init(1. - theta); - bca->min_frac = 0.002; - bca->min_support = 1; - bca->per_sample_flt = 0; - bca->npos = 100; - bca->ref_pos = malloc(bca->npos*sizeof(int)); - bca->alt_pos = malloc(bca->npos*sizeof(int)); - bca->nqual = 60; - bca->ref_mq = malloc(bca->nqual*sizeof(int)); - bca->alt_mq = malloc(bca->nqual*sizeof(int)); - bca->ref_bq = malloc(bca->nqual*sizeof(int)); - bca->alt_bq = malloc(bca->nqual*sizeof(int)); - bca->fwd_mqs = malloc(bca->nqual*sizeof(int)); - bca->rev_mqs = malloc(bca->nqual*sizeof(int)); - return bca; -} - -void bcf_call_destroy(bcf_callaux_t *bca) -{ - if (bca == 0) return; - errmod_destroy(bca->e); - if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } - free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); - free(bca->fwd_mqs); free(bca->rev_mqs); - bca->nqual = 0; - free(bca->bases); free(bca->inscns); free(bca); -} - -// position in the sequence with respect to the aligned part of the read -static int get_position(const bam_pileup1_t *p, int *len) -{ - int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; - for (icig=0; icigb->core.n_cigar; icig++) - { - int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; - int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; - if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) - { - n_tot_bases += ncig; - iread += ncig; - continue; - } - if ( cig==BAM_CINS ) - { - n_tot_bases += ncig; - iread += ncig; - 
continue; - } - if ( cig==BAM_CSOFT_CLIP ) - { - iread += ncig; - if ( iread<=p->qpos ) edist -= ncig; - continue; - } - if ( cig==BAM_CDEL ) continue; - if ( cig==BAM_CHARD_CLIP ) continue; - if ( cig==BAM_CPAD ) continue; - if ( cig==BAM_CREF_SKIP ) continue; - fprintf(stderr,"todo: cigar %d\n", cig); - assert(0); - } - *len = n_tot_bases; - return edist; -} - -void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) -{ - memset(bca->ref_pos,0,sizeof(int)*bca->npos); - memset(bca->alt_pos,0,sizeof(int)*bca->npos); - memset(bca->ref_mq,0,sizeof(int)*bca->nqual); - memset(bca->alt_mq,0,sizeof(int)*bca->nqual); - memset(bca->ref_bq,0,sizeof(int)*bca->nqual); - memset(bca->alt_bq,0,sizeof(int)*bca->nqual); - memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); - memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); - if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); - if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); -} - -/* - Notes: - - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies - which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. - Later it's used for multiallelic calling by bcftools -m - - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. - */ -/* - * This function is called once for each sample. - * _n is number of pilesups pl contributing reads to this sample - * pl is pointer to array of _n pileups (one pileup per read) - * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. 
- * bca is the settings to perform calls across all samples - * r is the returned value of the call - */ -int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) -{ - int i, n, ref4, is_indel, ori_depth = 0; - - // clean from previous run - r->ori_depth = 0; - r->mq0 = 0; - memset(r->qsum,0,sizeof(float)*4); - memset(r->anno,0,sizeof(double)*16); - memset(r->p,0,sizeof(float)*25); - - if (ref_base >= 0) { - ref4 = seq_nt16_int[ref_base]; - is_indel = 0; - } else ref4 = 4, is_indel = 1; - if (_n == 0) return -1; - // enlarge the bases array if necessary - if (bca->max_bases < _n) { - bca->max_bases = _n; - kroundup32(bca->max_bases); - bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); - } - // fill the bases array - for (i = n = 0; i < _n; ++i) { - const bam_pileup1_t *p = pl + i; - int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; - // set base - if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; - ++ori_depth; - mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 - if ( !mapQ ) r->mq0++; - baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality - seqQ = is_indel? (p->aux>>8&0xff) : 99; - if (q < bca->min_baseQ) continue; - if (q > seqQ) q = seqQ; - mapQ = mapQ < bca->capQ? mapQ : bca->capQ; - if (q > mapQ) q = mapQ; - if (q > 63) q = 63; - if (q < 4) q = 4; // MQ=0 reads count as BQ=4 - if (!is_indel) { - b = bam_seqi(bam_get_seq(p->b), p->qpos); // base - b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base - is_diff = (ref4 < 4 && b == ref4)? 
0 : 1; - } else { - b = p->aux>>16&0x3f; - is_diff = (b != 0); - } - bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; - // collect annotations - if (b < 4) - { - r->qsum[b] += q; - if ( r->ADF ) - { - if ( bam_is_rev(p->b) ) - r->ADR[b]++; - else - r->ADF[b]++; - } - } - ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)]; - min_dist = p->b->core.l_qseq - 1 - p->qpos; - if (min_dist > p->qpos) min_dist = p->qpos; - if (min_dist > CAP_DIST) min_dist = CAP_DIST; - r->anno[1<<2|is_diff<<1|0] += baseQ; - r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; - r->anno[2<<2|is_diff<<1|0] += mapQ; - r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; - r->anno[3<<2|is_diff<<1|0] += min_dist; - r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; - - // collect for bias tests - if ( baseQ > 59 ) baseQ = 59; - if ( mapQ > 59 ) mapQ = 59; - int len, pos = get_position(p, &len); - int epos = (double)pos/(len+1) * bca->npos; - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. * bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; - else bca->fwd_mqs[imq]++; - if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) - { - bca->ref_pos[epos]++; - bca->ref_bq[ibq]++; - bca->ref_mq[imq]++; - } - else - { - bca->alt_pos[epos]++; - bca->alt_bq[ibq]++; - bca->alt_mq[imq]++; - } - } - r->ori_depth = ori_depth; - // glfgen - errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype - return n; -} - - -/* - * calc_vdb() - returns value between zero (most biased) and one (no bias) - * on success, or HUGE_VAL when VDB cannot be calculated because - * of insufficient depth (<2x) - * - * Variant Distance Bias tests if the variant bases are positioned within the - * reads with sufficient randomness. Unlike other tests, it looks only at - * variant reads and therefore gives different kind of information than Read - * Position Bias for instance. VDB was developed for detecting artefacts in - * RNA-seq calls where reads from spliced transcripts span splice site - * boundaries. 
The current implementation differs somewhat from the original - * version described in supplementary material of PMID:22524474, but the idea - * remains the same. (Here the random variable tested is the average distance - * from the averaged position, not the average pairwise distance.) - * - * For coverage of 2x, the calculation is exact but is approximated for the - * rest. The result is most accurate between 4-200x. For 3x or >200x, the - * reported values are slightly more favourable than those of a true random - * distribution. - */ -double calc_vdb(int *pos, int npos) -{ - // Note well: the parameters were obtained by fitting to simulated data of - // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen(). - const int readlen = 100; - assert( npos==readlen ); - - #define nparam 15 - const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5}, - {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8}, - {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7}, - {200,0.7,23.7} }; - - int i, dp = 0; - float mean_pos = 0, mean_diff = 0; - for (i=0; i=200 ) - i = nparam; // shortcut for big depths - else - { - for (i=0; i=dp ) break; - } - float pshift, pscale; - if ( i==nparam ) - { - // the depth is too high, go with 200x - pscale = param[nparam-1][1]; - pshift = param[nparam-1][2]; - } - else if ( i>0 && param[i][0]!=dp ) - { - // linear interpolation of parameters - pscale = (param[i-1][1] + param[i][1])*0.5; - pshift = (param[i-1][2] + param[i][2])*0.5; - } - else - { - pscale = param[i][1]; - pshift = param[i][2]; - } - return 0.5*kf_erfc(-(mean_diff-pshift)*pscale); -} - -double calc_chisq_bias(int *a, int *b, int n) -{ - int na = 0, nb = 0, i, ndf = n; - for (i=0; i=8 && nb>=8 and reasonable if na<8 or nb<8 - if ( na>=8 || nb>=8 ) - { - double mean = ((double)na*nb)*0.5; - double var2 = ((double)na*nb)*(na+nb+1)/12.0; - double z = (U_min - mean)/sqrt(2*var2); // z is 
N(0,1) - return 2.0 - kf_erfc(z); // which is 1 + erf(z) - } - - // Exact calculation - double pval = 2*mann_whitney_1947_cdf(na,nb,U_min); - return pval>1 ? 1 : pval; -} - -double calc_mwu_bias(int *a, int *b, int n) -{ - int na = 0, nb = 0, i; - double U = 0; - for (i=0; imean ? (2.0*mean-U)/mean : U/mean; - } - double var2 = ((double)na*nb)*(na+nb+1)/12.0; - if ( na>=8 || nb>=8 ) - { - // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8 - return exp(-0.5*(U-mean)*(U-mean)/var2); - } - - // Exact calculation - return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); -} - -static inline double logsumexp2(double a, double b) -{ - if ( a>b ) - return log(1 + exp(b-a)) + a; - else - return log(1 + exp(a-b)) + b; -} - -void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) -{ - call->seg_bias = HUGE_VAL; - if ( !bcr ) return; - - int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads - if ( !nr ) return; - - int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth - double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population - if ( M>call->n ) M = call->n; // clamp M at the number of samples - else if ( M==0 ) M = 1; - double f = M / 2. 
/ call->n; // allele frequency - double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson) - double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson) - double sum = 0; - const double log2 = log(2.0); - - // fprintf(stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); - int i; - for (i=0; in; i++) - { - int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads - double tmp; - if ( oi ) - { - // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow - tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q); - tmp += log(f) + oi*log(q/p) - q + p; - } - else - tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; - sum += tmp; - // fprintf(stderr,"oi=%d %e\n", oi,tmp); - } - call->seg_bias = sum; -} - -/** - * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles - * @n: number of samples - * @calls: each sample's calls - * @bca: auxiliary data structure for holding temporary values - * @ref_base: the reference base - * @call: filled with the annotations - * - * Combines calls across the various samples being studied - * 1. For each allele at each base across all samples the quality is summed so - * you end up with a set of quality sums for each allele present 2. The quality - * sums are sorted. - * 3. Using the sorted quality sums we now create the allele ordering array - * A\subN. This is done by doing the following: - * a) If the reference allele is known it always comes first, otherwise N - * comes first. - * b) Then the rest of the alleles are output in descending order of quality - * sum (which we already know the qsum array was sorted). Any allelles with - * qsum 0 will be excluded. - * 4. Using the allele ordering array we create the genotype ordering array. 
- * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1 - * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4 - * 5. The genotype ordering array is then used to extract data from the error - * model 5*5 matrix and is used to produce a Phread likelihood array for each - * sample. - */ -int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) -{ - int ref4, i, j; - float qsum[5] = {0,0,0,0,0}; - if (ref_base >= 0) { - call->ori_ref = ref4 = seq_nt16_int[ref_base]; - if (ref4 > 4) ref4 = 4; - } else call->ori_ref = -1, ref4 = 0; - - // calculate qsum, this is done by summing normalized qsum across all samples, - // to account for differences in coverage - for (i = 0; i < n; ++i) - { - float sum = 0; - for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; - if ( sum ) - for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; - } - - // sort qsum in ascending order (insertion sort) - float *ptr[5], *tmp; - for (i=0; i<5; i++) ptr[i] = &qsum[i]; - for (i=1; i<4; i++) - for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--) - tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp; - - // Set the reference allele and alternative allele(s) - for (i=0; i<5; i++) call->a[i] = -1; - for (i=0; i<5; i++) call->qsum[i] = 0; - call->unseen = -1; - call->a[0] = ref4; - for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering - { - int ipos = ptr[i] - qsum; // position in sorted qsum array - if ( ipos==ref4 ) - call->qsum[0] = qsum[ipos]; // REF's qsum - else - { - if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup - call->qsum[j] = qsum[ipos]; - call->a[j++] = ipos; - } - } - if (ref_base >= 0) - { - // for SNPs, find the "unseen" base - if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) - call->unseen = j, call->a[j++] = ptr[i] - qsum; - call->n_alleles = j; - } - else - { - call->n_alleles = j; - if 
(call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything - } - /* - * Set the phread likelihood array (call->PL) This array is 15 entries long - * for each sample because that is size of an upper or lower triangle of a - * worst case 5x5 matrix of possible genotypes. This worst case matrix will - * occur when all 4 possible alleles are present and the reference allele - * is unknown. The sides of the matrix will correspond to the reference - * allele (if known) followed by the alleles present in descending order of - * quality sum - */ - { - int x, g[15], z; - double sum_min = 0.; - x = call->n_alleles * (call->n_alleles + 1) / 2; - // get the possible genotypes - // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix - for (i = z = 0; i < call->n_alleles; ++i) { - for (j = 0; j <= i; ++j) { - g[z++] = call->a[j] * 5 + call->a[i]; - } - } - // for each sample calculate the PL - for (i = 0; i < n; ++i) - { - int32_t *PL = call->PL + x * i; - const bcf_callret1_t *r = calls + i; - float min = FLT_MAX; - for (j = 0; j < x; ++j) { - if (min > r->p[g[j]]) min = r->p[g[j]]; - } - sum_min += min; - for (j = 0; j < x; ++j) { - int y; - y = (int)(r->p[g[j]] - min + .499); - if (y > 255) y = 255; - PL[j] = y; - } - } - if ( call->DP4 ) - { - for (i=0; iDP4[4*i] = calls[i].anno[0]; - call->DP4[4*i+1] = calls[i].anno[1]; - call->DP4[4*i+2] = calls[i].anno[2]; - call->DP4[4*i+3] = calls[i].anno[3]; - } - } - if ( call->ADF ) - { - assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well - - // reorder ADR,ADF to match the allele ordering at this site - int32_t tmp[B2B_MAX_ALLELES]; - int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES; - int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES; - int32_t *adr_tot = call->ADR; // the first bin stores total counts per site - 
int32_t *adf_tot = call->ADF; - for (i=0; in_alleles; j++) - { - tmp[j] = adr[ call->a[j] ]; - adr_tot[j] += tmp[j]; - } - for (j=0; jn_alleles; j++) adr_out[j] = tmp[j]; - for (j=0; jn_alleles; j++) - { - tmp[j] = adf[ call->a[j] ]; - adf_tot[j] += tmp[j]; - } - for (j=0; jn_alleles; j++) adf_out[j] = tmp[j]; - adf_out += call->n_alleles; - adr_out += call->n_alleles; - adr += B2B_MAX_ALLELES; - adf += B2B_MAX_ALLELES; - } - } - -// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); - call->shift = (int)(sum_min + .499); - } - // combine annotations - memset(call->anno, 0, 16 * sizeof(double)); - call->ori_depth = 0; - call->depth = 0; - call->mq0 = 0; - for (i = 0; i < n; ++i) { - call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3]; - call->ori_depth += calls[i].ori_depth; - call->mq0 += calls[i].mq0; - for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; - } - - calc_SegBias(calls, call); - - // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); - // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); - // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - - call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); - -#if CDF_MWU_TESTS - call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); 
-#endif - - call->vdb = calc_vdb(bca->alt_pos, bca->npos); - - return 0; -} - -int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) -{ - extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); - int i, j, nals = 1; - - bcf_hdr_t *hdr = bc->bcf_hdr; - rec->rid = bc->tid; - rec->pos = bc->pos; - rec->qual = 0; - - bc->tmp.l = 0; - if (bc->ori_ref < 0) // indel - { - // REF - kputc(ref[bc->pos], &bc->tmp); - for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); - - // ALT - for (i=1; i<4; i++) - { - if (bc->a[i] < 0) break; - kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp); - - if (bca->indel_types[bc->a[i]] < 0) { // deletion - for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j) - kputc(ref[bc->pos+1+j], &bc->tmp); - } else { // insertion; cannot be a reference unless a bug - char *inscns = &bca->inscns[bc->a[i] * bca->maxins]; - for (j = 0; j < bca->indel_types[bc->a[i]]; ++j) - kputc("ACGTN"[(int)inscns[j]], &bc->tmp); - for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); - } - nals++; - } - } - else // SNP - { - kputc("ACGTN"[bc->ori_ref], &bc->tmp); - for (i=1; i<5; i++) - { - if (bc->a[i] < 0) break; - kputc(',', &bc->tmp); - if ( bc->unseen==i ) kputs("<*>", &bc->tmp); - else kputc("ACGT"[bc->a[i]], &bc->tmp); - nals++; - } - } - bcf_update_alleles_str(hdr, rec, bc->tmp.s); - - bc->tmp.l = 0; - - // INFO - if (bc->ori_ref < 0) - { - bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); - bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); - bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); - } - bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); - if ( fmt_flag&B2B_INFO_ADF ) - bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele); - if ( fmt_flag&B2B_INFO_ADR ) - bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele); - if ( 
fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) ) - { - for (i=0; in_allele; i++) bc->ADF[i] += bc->ADR[i]; - if ( fmt_flag&B2B_INFO_AD ) - bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele); - if ( fmt_flag&B2B_INFO_DPR ) - bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); - } - - float tmpf[16]; - for (i=0; i<16; i++) tmpf[i] = bc->anno[i]; - bcf_update_info_float(hdr, rec, "I16", tmpf, 16); - bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals); - - if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); - if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); -#if CDF_MWU_TESTS - if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); - if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); - if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); - if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); -#endif - tmpf[0] = bc->ori_depth ? 
(float)bc->mq0/bc->ori_depth : 0; - bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); - - // FORMAT - rec->n_sample = bc->n; - bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample); - if ( fmt_flag&B2B_FMT_DP ) - { - int32_t *ptr = (int32_t*) bc->fmt_arr; - for (i=0; in; i++) - ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3]; - bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample); - } - if ( fmt_flag&B2B_FMT_DV ) - { - int32_t *ptr = (int32_t*) bc->fmt_arr; - for (i=0; in; i++) - ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3]; - bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample); - } - if ( fmt_flag&B2B_FMT_SP ) - { - int32_t *ptr = (int32_t*) bc->fmt_arr; - for (i=0; in; i++) - { - int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3]; - if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 ) - ptr[i] = 0; - else - { - double left, right, two; - kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two); - int32_t x = (int)(-4.343 * log(two) + .499); - if (x > 255) x = 255; - ptr[i] = x; - } - } - bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample); - } - if ( fmt_flag&B2B_FMT_DP4 ) - bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4); - if ( fmt_flag&B2B_FMT_ADF ) - bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - if ( fmt_flag&B2B_FMT_ADR ) - bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) ) - { - for (i=0; in_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i]; - if ( fmt_flag&B2B_FMT_AD ) - bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - if ( fmt_flag&B2B_FMT_DPR ) - bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, 
rec->n_sample*rec->n_allele); - } - - return 0; -} diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c deleted file mode 100644 index 70b8bee..0000000 --- a/samtools/bam2bcf.c.pysam.c +++ /dev/null @@ -1,823 +0,0 @@ -#include "samtools.pysam.h" - -/* bam2bcf.c -- variant calling. - - Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2015, 2021 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "bam2bcf.h" - -extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); - -#define CALL_DEFTHETA 0.83 -#define DEF_MAPQ 20 - -#define CAP_DIST 25 - -bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) -{ - bcf_callaux_t *bca; - if (theta <= 0.) 
theta = CALL_DEFTHETA; - bca = calloc(1, sizeof(bcf_callaux_t)); - bca->capQ = 60; - bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; - bca->min_baseQ = min_baseQ; - bca->e = errmod_init(1. - theta); - bca->min_frac = 0.002; - bca->min_support = 1; - bca->per_sample_flt = 0; - bca->npos = 100; - bca->ref_pos = malloc(bca->npos*sizeof(int)); - bca->alt_pos = malloc(bca->npos*sizeof(int)); - bca->nqual = 60; - bca->ref_mq = malloc(bca->nqual*sizeof(int)); - bca->alt_mq = malloc(bca->nqual*sizeof(int)); - bca->ref_bq = malloc(bca->nqual*sizeof(int)); - bca->alt_bq = malloc(bca->nqual*sizeof(int)); - bca->fwd_mqs = malloc(bca->nqual*sizeof(int)); - bca->rev_mqs = malloc(bca->nqual*sizeof(int)); - return bca; -} - -void bcf_call_destroy(bcf_callaux_t *bca) -{ - if (bca == 0) return; - errmod_destroy(bca->e); - if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } - free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); - free(bca->fwd_mqs); free(bca->rev_mqs); - bca->nqual = 0; - free(bca->bases); free(bca->inscns); free(bca); -} - -// position in the sequence with respect to the aligned part of the read -static int get_position(const bam_pileup1_t *p, int *len) -{ - int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; - for (icig=0; icigb->core.n_cigar; icig++) - { - int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; - int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; - if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) - { - n_tot_bases += ncig; - iread += ncig; - continue; - } - if ( cig==BAM_CINS ) - { - n_tot_bases += ncig; - iread += ncig; - continue; - } - if ( cig==BAM_CSOFT_CLIP ) - { - iread += ncig; - if ( iread<=p->qpos ) edist -= ncig; - continue; - } - if ( cig==BAM_CDEL ) continue; - if ( cig==BAM_CHARD_CLIP ) continue; - if ( cig==BAM_CPAD ) continue; - if ( cig==BAM_CREF_SKIP ) continue; - fprintf(samtools_stderr,"todo: cigar %d\n", cig); - assert(0); - } - *len = n_tot_bases; - 
return edist; -} - -void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) -{ - memset(bca->ref_pos,0,sizeof(int)*bca->npos); - memset(bca->alt_pos,0,sizeof(int)*bca->npos); - memset(bca->ref_mq,0,sizeof(int)*bca->nqual); - memset(bca->alt_mq,0,sizeof(int)*bca->nqual); - memset(bca->ref_bq,0,sizeof(int)*bca->nqual); - memset(bca->alt_bq,0,sizeof(int)*bca->nqual); - memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); - memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); - if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); - if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); -} - -/* - Notes: - - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies - which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. - Later it's used for multiallelic calling by bcftools -m - - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. - */ -/* - * This function is called once for each sample. - * _n is number of pilesups pl contributing reads to this sample - * pl is pointer to array of _n pileups (one pileup per read) - * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. 
- * bca is the settings to perform calls across all samples - * r is the returned value of the call - */ -int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) -{ - int i, n, ref4, is_indel, ori_depth = 0; - - // clean from previous run - r->ori_depth = 0; - r->mq0 = 0; - memset(r->qsum,0,sizeof(float)*4); - memset(r->anno,0,sizeof(double)*16); - memset(r->p,0,sizeof(float)*25); - - if (ref_base >= 0) { - ref4 = seq_nt16_int[ref_base]; - is_indel = 0; - } else ref4 = 4, is_indel = 1; - if (_n == 0) return -1; - // enlarge the bases array if necessary - if (bca->max_bases < _n) { - bca->max_bases = _n; - kroundup32(bca->max_bases); - bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); - } - // fill the bases array - for (i = n = 0; i < _n; ++i) { - const bam_pileup1_t *p = pl + i; - int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; - // set base - if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; - ++ori_depth; - mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 - if ( !mapQ ) r->mq0++; - baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality - seqQ = is_indel? (p->aux>>8&0xff) : 99; - if (q < bca->min_baseQ) continue; - if (q > seqQ) q = seqQ; - mapQ = mapQ < bca->capQ? mapQ : bca->capQ; - if (q > mapQ) q = mapQ; - if (q > 63) q = 63; - if (q < 4) q = 4; // MQ=0 reads count as BQ=4 - if (!is_indel) { - b = bam_seqi(bam_get_seq(p->b), p->qpos); // base - b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base - is_diff = (ref4 < 4 && b == ref4)? 
0 : 1; - } else { - b = p->aux>>16&0x3f; - is_diff = (b != 0); - } - bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; - // collect annotations - if (b < 4) - { - r->qsum[b] += q; - if ( r->ADF ) - { - if ( bam_is_rev(p->b) ) - r->ADR[b]++; - else - r->ADF[b]++; - } - } - ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)]; - min_dist = p->b->core.l_qseq - 1 - p->qpos; - if (min_dist > p->qpos) min_dist = p->qpos; - if (min_dist > CAP_DIST) min_dist = CAP_DIST; - r->anno[1<<2|is_diff<<1|0] += baseQ; - r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; - r->anno[2<<2|is_diff<<1|0] += mapQ; - r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; - r->anno[3<<2|is_diff<<1|0] += min_dist; - r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; - - // collect for bias tests - if ( baseQ > 59 ) baseQ = 59; - if ( mapQ > 59 ) mapQ = 59; - int len, pos = get_position(p, &len); - int epos = (double)pos/(len+1) * bca->npos; - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. * bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; - else bca->fwd_mqs[imq]++; - if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) - { - bca->ref_pos[epos]++; - bca->ref_bq[ibq]++; - bca->ref_mq[imq]++; - } - else - { - bca->alt_pos[epos]++; - bca->alt_bq[ibq]++; - bca->alt_mq[imq]++; - } - } - r->ori_depth = ori_depth; - // glfgen - errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype - return n; -} - - -/* - * calc_vdb() - returns value between zero (most biased) and one (no bias) - * on success, or HUGE_VAL when VDB cannot be calculated because - * of insufficient depth (<2x) - * - * Variant Distance Bias tests if the variant bases are positioned within the - * reads with sufficient randomness. Unlike other tests, it looks only at - * variant reads and therefore gives different kind of information than Read - * Position Bias for instance. VDB was developed for detecting artefacts in - * RNA-seq calls where reads from spliced transcripts span splice site - * boundaries. 
The current implementation differs somewhat from the original - * version described in supplementary material of PMID:22524474, but the idea - * remains the same. (Here the random variable tested is the average distance - * from the averaged position, not the average pairwise distance.) - * - * For coverage of 2x, the calculation is exact but is approximated for the - * rest. The result is most accurate between 4-200x. For 3x or >200x, the - * reported values are slightly more favourable than those of a true random - * distribution. - */ -double calc_vdb(int *pos, int npos) -{ - // Note well: the parameters were obtained by fitting to simulated data of - // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen(). - const int readlen = 100; - assert( npos==readlen ); - - #define nparam 15 - const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5}, - {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8}, - {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7}, - {200,0.7,23.7} }; - - int i, dp = 0; - float mean_pos = 0, mean_diff = 0; - for (i=0; i=200 ) - i = nparam; // shortcut for big depths - else - { - for (i=0; i=dp ) break; - } - float pshift, pscale; - if ( i==nparam ) - { - // the depth is too high, go with 200x - pscale = param[nparam-1][1]; - pshift = param[nparam-1][2]; - } - else if ( i>0 && param[i][0]!=dp ) - { - // linear interpolation of parameters - pscale = (param[i-1][1] + param[i][1])*0.5; - pshift = (param[i-1][2] + param[i][2])*0.5; - } - else - { - pscale = param[i][1]; - pshift = param[i][2]; - } - return 0.5*kf_erfc(-(mean_diff-pshift)*pscale); -} - -double calc_chisq_bias(int *a, int *b, int n) -{ - int na = 0, nb = 0, i, ndf = n; - for (i=0; i=8 && nb>=8 and reasonable if na<8 or nb<8 - if ( na>=8 || nb>=8 ) - { - double mean = ((double)na*nb)*0.5; - double var2 = ((double)na*nb)*(na+nb+1)/12.0; - double z = (U_min - mean)/sqrt(2*var2); // z is 
N(0,1) - return 2.0 - kf_erfc(z); // which is 1 + erf(z) - } - - // Exact calculation - double pval = 2*mann_whitney_1947_cdf(na,nb,U_min); - return pval>1 ? 1 : pval; -} - -double calc_mwu_bias(int *a, int *b, int n) -{ - int na = 0, nb = 0, i; - double U = 0; - for (i=0; imean ? (2.0*mean-U)/mean : U/mean; - } - double var2 = ((double)na*nb)*(na+nb+1)/12.0; - if ( na>=8 || nb>=8 ) - { - // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8 - return exp(-0.5*(U-mean)*(U-mean)/var2); - } - - // Exact calculation - return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); -} - -static inline double logsumexp2(double a, double b) -{ - if ( a>b ) - return log(1 + exp(b-a)) + a; - else - return log(1 + exp(a-b)) + b; -} - -void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) -{ - call->seg_bias = HUGE_VAL; - if ( !bcr ) return; - - int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads - if ( !nr ) return; - - int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth - double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population - if ( M>call->n ) M = call->n; // clamp M at the number of samples - else if ( M==0 ) M = 1; - double f = M / 2. 
/ call->n; // allele frequency - double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson) - double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson) - double sum = 0; - const double log2 = log(2.0); - - // fprintf(samtools_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); - int i; - for (i=0; in; i++) - { - int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads - double tmp; - if ( oi ) - { - // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow - tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q); - tmp += log(f) + oi*log(q/p) - q + p; - } - else - tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; - sum += tmp; - // fprintf(samtools_stderr,"oi=%d %e\n", oi,tmp); - } - call->seg_bias = sum; -} - -/** - * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles - * @n: number of samples - * @calls: each sample's calls - * @bca: auxiliary data structure for holding temporary values - * @ref_base: the reference base - * @call: filled with the annotations - * - * Combines calls across the various samples being studied - * 1. For each allele at each base across all samples the quality is summed so - * you end up with a set of quality sums for each allele present 2. The quality - * sums are sorted. - * 3. Using the sorted quality sums we now create the allele ordering array - * A\subN. This is done by doing the following: - * a) If the reference allele is known it always comes first, otherwise N - * comes first. - * b) Then the rest of the alleles are output in descending order of quality - * sum (which we already know the qsum array was sorted). Any allelles with - * qsum 0 will be excluded. - * 4. Using the allele ordering array we create the genotype ordering array. 
- * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1 - * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4 - * 5. The genotype ordering array is then used to extract data from the error - * model 5*5 matrix and is used to produce a Phread likelihood array for each - * sample. - */ -int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) -{ - int ref4, i, j; - float qsum[5] = {0,0,0,0,0}; - if (ref_base >= 0) { - call->ori_ref = ref4 = seq_nt16_int[ref_base]; - if (ref4 > 4) ref4 = 4; - } else call->ori_ref = -1, ref4 = 0; - - // calculate qsum, this is done by summing normalized qsum across all samples, - // to account for differences in coverage - for (i = 0; i < n; ++i) - { - float sum = 0; - for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; - if ( sum ) - for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; - } - - // sort qsum in ascending order (insertion sort) - float *ptr[5], *tmp; - for (i=0; i<5; i++) ptr[i] = &qsum[i]; - for (i=1; i<4; i++) - for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--) - tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp; - - // Set the reference allele and alternative allele(s) - for (i=0; i<5; i++) call->a[i] = -1; - for (i=0; i<5; i++) call->qsum[i] = 0; - call->unseen = -1; - call->a[0] = ref4; - for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering - { - int ipos = ptr[i] - qsum; // position in sorted qsum array - if ( ipos==ref4 ) - call->qsum[0] = qsum[ipos]; // REF's qsum - else - { - if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup - call->qsum[j] = qsum[ipos]; - call->a[j++] = ipos; - } - } - if (ref_base >= 0) - { - // for SNPs, find the "unseen" base - if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) - call->unseen = j, call->a[j++] = ptr[i] - qsum; - call->n_alleles = j; - } - else - { - call->n_alleles = j; - if 
(call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything - } - /* - * Set the phread likelihood array (call->PL) This array is 15 entries long - * for each sample because that is size of an upper or lower triangle of a - * worst case 5x5 matrix of possible genotypes. This worst case matrix will - * occur when all 4 possible alleles are present and the reference allele - * is unknown. The sides of the matrix will correspond to the reference - * allele (if known) followed by the alleles present in descending order of - * quality sum - */ - { - int x, g[15], z; - double sum_min = 0.; - x = call->n_alleles * (call->n_alleles + 1) / 2; - // get the possible genotypes - // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix - for (i = z = 0; i < call->n_alleles; ++i) { - for (j = 0; j <= i; ++j) { - g[z++] = call->a[j] * 5 + call->a[i]; - } - } - // for each sample calculate the PL - for (i = 0; i < n; ++i) - { - int32_t *PL = call->PL + x * i; - const bcf_callret1_t *r = calls + i; - float min = FLT_MAX; - for (j = 0; j < x; ++j) { - if (min > r->p[g[j]]) min = r->p[g[j]]; - } - sum_min += min; - for (j = 0; j < x; ++j) { - int y; - y = (int)(r->p[g[j]] - min + .499); - if (y > 255) y = 255; - PL[j] = y; - } - } - if ( call->DP4 ) - { - for (i=0; iDP4[4*i] = calls[i].anno[0]; - call->DP4[4*i+1] = calls[i].anno[1]; - call->DP4[4*i+2] = calls[i].anno[2]; - call->DP4[4*i+3] = calls[i].anno[3]; - } - } - if ( call->ADF ) - { - assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well - - // reorder ADR,ADF to match the allele ordering at this site - int32_t tmp[B2B_MAX_ALLELES]; - int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES; - int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES; - int32_t *adr_tot = call->ADR; // the first bin stores total counts per site - 
int32_t *adf_tot = call->ADF; - for (i=0; in_alleles; j++) - { - tmp[j] = adr[ call->a[j] ]; - adr_tot[j] += tmp[j]; - } - for (j=0; jn_alleles; j++) adr_out[j] = tmp[j]; - for (j=0; jn_alleles; j++) - { - tmp[j] = adf[ call->a[j] ]; - adf_tot[j] += tmp[j]; - } - for (j=0; jn_alleles; j++) adf_out[j] = tmp[j]; - adf_out += call->n_alleles; - adr_out += call->n_alleles; - adr += B2B_MAX_ALLELES; - adf += B2B_MAX_ALLELES; - } - } - -// if (ref_base < 0) fprintf(samtools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); - call->shift = (int)(sum_min + .499); - } - // combine annotations - memset(call->anno, 0, 16 * sizeof(double)); - call->ori_depth = 0; - call->depth = 0; - call->mq0 = 0; - for (i = 0; i < n; ++i) { - call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3]; - call->ori_depth += calls[i].ori_depth; - call->mq0 += calls[i].mq0; - for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; - } - - calc_SegBias(calls, call); - - // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); - // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); - // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - - call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); - -#if CDF_MWU_TESTS - call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, 
bca->nqual); -#endif - - call->vdb = calc_vdb(bca->alt_pos, bca->npos); - - return 0; -} - -int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) -{ - extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); - int i, j, nals = 1; - - bcf_hdr_t *hdr = bc->bcf_hdr; - rec->rid = bc->tid; - rec->pos = bc->pos; - rec->qual = 0; - - bc->tmp.l = 0; - if (bc->ori_ref < 0) // indel - { - // REF - kputc(ref[bc->pos], &bc->tmp); - for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); - - // ALT - for (i=1; i<4; i++) - { - if (bc->a[i] < 0) break; - kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp); - - if (bca->indel_types[bc->a[i]] < 0) { // deletion - for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j) - kputc(ref[bc->pos+1+j], &bc->tmp); - } else { // insertion; cannot be a reference unless a bug - char *inscns = &bca->inscns[bc->a[i] * bca->maxins]; - for (j = 0; j < bca->indel_types[bc->a[i]]; ++j) - kputc("ACGTN"[(int)inscns[j]], &bc->tmp); - for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); - } - nals++; - } - } - else // SNP - { - kputc("ACGTN"[bc->ori_ref], &bc->tmp); - for (i=1; i<5; i++) - { - if (bc->a[i] < 0) break; - kputc(',', &bc->tmp); - if ( bc->unseen==i ) kputs("<*>", &bc->tmp); - else kputc("ACGT"[bc->a[i]], &bc->tmp); - nals++; - } - } - bcf_update_alleles_str(hdr, rec, bc->tmp.s); - - bc->tmp.l = 0; - - // INFO - if (bc->ori_ref < 0) - { - bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); - bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); - bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); - } - bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); - if ( fmt_flag&B2B_INFO_ADF ) - bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele); - if ( fmt_flag&B2B_INFO_ADR ) - bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele); - if ( 
fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) ) - { - for (i=0; in_allele; i++) bc->ADF[i] += bc->ADR[i]; - if ( fmt_flag&B2B_INFO_AD ) - bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele); - if ( fmt_flag&B2B_INFO_DPR ) - bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); - } - - float tmpf[16]; - for (i=0; i<16; i++) tmpf[i] = bc->anno[i]; - bcf_update_info_float(hdr, rec, "I16", tmpf, 16); - bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals); - - if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); - if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); -#if CDF_MWU_TESTS - if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); - if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); - if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); - if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); -#endif - tmpf[0] = bc->ori_depth ? 
(float)bc->mq0/bc->ori_depth : 0; - bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); - - // FORMAT - rec->n_sample = bc->n; - bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample); - if ( fmt_flag&B2B_FMT_DP ) - { - int32_t *ptr = (int32_t*) bc->fmt_arr; - for (i=0; in; i++) - ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3]; - bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample); - } - if ( fmt_flag&B2B_FMT_DV ) - { - int32_t *ptr = (int32_t*) bc->fmt_arr; - for (i=0; in; i++) - ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3]; - bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample); - } - if ( fmt_flag&B2B_FMT_SP ) - { - int32_t *ptr = (int32_t*) bc->fmt_arr; - for (i=0; in; i++) - { - int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3]; - if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 ) - ptr[i] = 0; - else - { - double left, right, two; - kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two); - int32_t x = (int)(-4.343 * log(two) + .499); - if (x > 255) x = 255; - ptr[i] = x; - } - } - bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample); - } - if ( fmt_flag&B2B_FMT_DP4 ) - bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4); - if ( fmt_flag&B2B_FMT_ADF ) - bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - if ( fmt_flag&B2B_FMT_ADR ) - bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) ) - { - for (i=0; in_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i]; - if ( fmt_flag&B2B_FMT_AD ) - bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - if ( fmt_flag&B2B_FMT_DPR ) - bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, 
rec->n_sample*rec->n_allele); - } - - return 0; -} diff --git a/samtools/bam2bcf.h b/samtools/bam2bcf.h deleted file mode 100644 index 972923d..0000000 --- a/samtools/bam2bcf.h +++ /dev/null @@ -1,140 +0,0 @@ -/* bam2bcf.h -- variant calling. - - Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2014, 2019 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#ifndef BAM2BCF_H -#define BAM2BCF_H - -#include -#include -#include - -/** - * A simplified version of Mann-Whitney U-test is calculated - * by default (no CDF) because it is faster and seems to work - * better in machine learning filtering. When enabled by setting - * CDF_MWU_TESTS, additional annotations will appear on mpileup's - * output (RPB2 in addition to RPB, etc.). 
- */ -#ifndef CDF_MWU_TESTS -#define CDF_MWU_TESTS 0 -#endif - -#define B2B_INDEL_NULL 10000 - -#define B2B_FMT_DP (1<<0) -#define B2B_FMT_SP (1<<1) -#define B2B_FMT_DV (1<<2) -#define B2B_FMT_DP4 (1<<3) -#define B2B_FMT_DPR (1<<4) -#define B2B_INFO_DPR (1<<5) -#define B2B_FMT_AD (1<<6) -#define B2B_FMT_ADF (1<<7) -#define B2B_FMT_ADR (1<<8) -#define B2B_INFO_AD (1<<9) -#define B2B_INFO_ADF (1<<10) -#define B2B_INFO_ADR (1<<11) - -#define B2B_MAX_ALLELES 5 - -typedef struct __bcf_callaux_t { - int capQ, min_baseQ; - int openQ, extQ, tandemQ; // for indels - uint32_t min_support, max_support; // for collecting indel candidates - double min_frac; // for collecting indel candidates - float max_frac; // for collecting indel candidates - int per_sample_flt; // indel filtering strategy - int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests - // for internal uses - int max_bases; - int indel_types[4]; // indel lengths - int maxins, indelreg; - int read_len; - char *inscns; - uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types) - errmod_t *e; - void *rghash; -} bcf_callaux_t; - -typedef struct { - uint32_t ori_depth; - unsigned int mq0; - int32_t *ADF, *ADR; - float qsum[4]; - // The fields are: - // depth fwd .. ref (0) and non-ref (2) - // depth rev .. ref (1) and non-ref (3) - // baseQ .. ref (4) and non-ref (6) - // baseQ^2 .. ref (5) and non-ref (7) - // mapQ .. ref (8) and non-ref (10) - // mapQ^2 .. ref (9) and non-ref (11) - // minDist .. ref (12) and non-ref (14) - // minDist^2 .. ref (13) and non-ref (15) - // Note that this probably needs a more thorough fix: int types in - // bcf_call_t do overflow with high-coverage data, such as exomes, and - // BCFv2 supports only floats which may not suffice. 
- double anno[16]; - float p[25]; // phred-scaled likelihood of each genotype -} bcf_callret1_t; - -typedef struct { - int tid; - hts_pos_t pos; - bcf_hdr_t *bcf_hdr; - int a[5]; // alleles: ref, alt, alt2, alt3 - float qsum[5]; // for the QS tag - int n, n_alleles, shift, ori_ref, unseen; - int n_supp; // number of supporting non-reference reads - double anno[16]; - unsigned int depth, ori_depth, mq0; - int32_t *PL, *DP4, *ADR, *ADF; - uint8_t *fmt_arr; - float vdb; // variant distance bias - float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; -#if CDF_MWU_TESTS - float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf; -#endif - float seg_bias; - kstring_t tmp; -} bcf_call_t; - -#ifdef __cplusplus -extern "C" { -#endif - - bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); - void bcf_call_destroy(bcf_callaux_t *bca); - int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); - int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); - int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, - const bcf_callaux_t *bca, const char *ref); - int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, - const void *rghash); - void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c deleted file mode 100644 index 17dedf0..0000000 --- a/samtools/bam2bcf_indel.c +++ /dev/null @@ -1,547 +0,0 @@ -/* bam2bcf_indel.c -- indel caller. - - Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014, 2019 Genome Research Ltd. 
- - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#include - -#include -#include -#include -#include "htslib/hts.h" -#include "htslib/sam.h" -#include "bam2bcf.h" -#include "htslib/khash.h" -KHASH_SET_INIT_STR(rg) - -#include "htslib/ksort.h" -KSORT_INIT_GENERIC(uint32_t) - -#define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 50 - -void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) -{ - const char *s, *p, *q, *r, *t; - khash_t(rg) *hash; - if (list == 0 || hdtext == 0) return _hash; - if (_hash == 0) _hash = kh_init(rg); - hash = (khash_t(rg)*)_hash; - if ((s = strstr(hdtext, "@RG\t")) == 0) return hash; - do { - t = strstr(s + 4, "@RG\t"); // the next @RG - if ((p = strstr(s, "\tID:")) != 0) p += 4; - if ((q = strstr(s, "\tPL:")) != 0) q += 4; - if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present - int lp, lq; - char *x; - for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { } - lp = r - p; - for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { } - lq = r - q; - x = calloc((lp > lq? 
lp : lq) + 1, 1); - for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r; - if (strstr(list, x)) { // insert ID to the hash table - khint_t k; - int ret; - for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r; - x[r-p] = 0; - k = kh_get(rg, hash, x); - if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret); - else free(x); - } else free(x); - } - s = t; - } while (s); - return hash; -} - -void bcf_call_del_rghash(void *_hash) -{ - khint_t k; - khash_t(rg) *hash = (khash_t(rg)*)_hash; - if (hash == 0) return; - for (k = kh_begin(hash); k < kh_end(hash); ++k) - if (kh_exist(hash, k)) - free((char*)kh_key(hash, k)); - kh_destroy(rg, hash); -} - -static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos) -{ - int k, y = 0, last_y = 0; - hts_pos_t x = c->pos; - *_tpos = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (c->pos > tpos) return y; - if (x + l > tpos) { - *_tpos = tpos; - return y + (tpos - x); - } - x += l; y += l; - last_y = y; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - if (x + l > tpos) { - *_tpos = is_left? x : x + l; - return y; - } - x += l; - } - } - *_tpos = x; - return last_y; -} -// FIXME: check if the inserted sequence is consistent with the homopolymer run -// l is the relative gap length and l_run is the length of the homopolymer on the reference -static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) -{ - int q, qh; - q = bca->openQ + bca->extQ * (abs(l) - 1); - qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; - return q < qh? 
q : qh; -} - -static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4) -{ - int j, max = 0, score = 0; - hts_pos_t i, max_i = pos; - l = abs(l); - for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { - if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; - else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1; - if (score < 0) break; - if (max < score) max = score, max_i = i; - } - return max_i - pos; -} - -/* - notes: - - n .. number of samples - - the routine sets bam_pileup1_t.aux of each read as follows: - - 6: unused - - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff - */ -int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, - const void *rghash) -{ - int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2; - int N, K, l_run, ref_type, n_alt; - hts_pos_t i, j, left, right; - char *inscns = 0, *ref2, *query, **ref_sample; - khash_t(rg) *hash = (khash_t(rg)*)rghash; - if (ref == 0 || bca == 0) return -1; - // mark filtered reads - if (rghash) { - N = 0; - for (s = N = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - const uint8_t *rg = bam_aux_get(p->b, "RG"); - p->aux = 1; // filtered by default - if (rg) { - khint_t k = kh_get(rg, hash, (const char*)(rg + 1)); - if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered - } - } - } - if (N == 0) return -1; // no reads left - } - // determine if there is a gap - for (s = N = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) - if (plp[s][i].indel != 0) break; - if (i < n_plp[s]) break; - } - if (s == n) return -1; // there is no indel at this position. 
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // find out how many types of indels are present - bca->max_support = bca->max_frac = 0; - int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; - uint32_t *aux; - aux = calloc(N + 1, 4); - m = max_rd_len = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (s = 0; s < n; ++s) { - int na = 0, nt = 0; - for (i = 0; i < n_plp[s]; ++i) { - const bam_pileup1_t *p = plp[s] + i; - if (rghash == 0 || p->aux == 0) { - ++nt; - if (p->indel != 0) { - ++na; - aux[m++] = MINUS_CONST + p->indel; - } - } - j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - double frac = (double)na/nt; - if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) - indel_support_ok = 1; - if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; - n_alt += na; - n_tot += nt; - } - // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), - // check the number of N's in the sequence and skip places where half or more reference bases are Ns. - int nN=0; for (i=pos; i-pos(i-pos) ) { free(aux); return -1; } - - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - // Taking totals makes it hard to call rare indels - if ( !bca->per_sample_flt ) - indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; - if ( n_types == 1 || !indel_support_ok ) { // then skip - free(aux); return -1; - } - if (n_types >= 64) { - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) - fprintf(stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". 
Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); - t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; - free(aux); - for (t = 0; t < n_types; ++t) - if (types[t] == 0) break; - ref_type = t; // the index of the reference type (0) - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - /* The following block fixes a long-existing flaw in the INDEL - * calling model: the interference of nearby SNPs. However, it also - * reduces the power because sometimes, substitutions caused by - * indels are not distinguishable from true mutations. Multiple - * sequence realignment helps to increase the power. - * - * Masks mismatches present in at least 70% of the reads with 'N'. - */ - { // construct per-sample consensus - int L = right - left + 1, max_i, max2_i; - uint32_t *cns, max, max2; - char *ref0, *r; - ref_sample = calloc(n, sizeof(char*)); - cns = calloc(L, 4); - ref0 = calloc(L, 1); - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; - for (s = 0; s < n; ++s) { - r = ref_sample[s] = calloc(L, 1); - memset(cns, 0, sizeof(int) * L); - // collect ref and non-ref counts - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - hts_pos_t x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) - if (x + j >= left && x + j < right) - cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 
1 : 0x10000; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // determine the consensus - for (i = 0; i < right - left; ++i) r[i] = ref0[i]; - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; - } - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); - } - free(ref0); free(cns); - } - { // the length of the homopolymer run around the current position - int c = seq_nt16_table[(int)ref[pos + 1]]; - if (c == 15) l_run = 1; - else { - for (i = pos + 2; ref[i]; ++i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run = i; - for (i = pos; i >= 0; --i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run -= i + 1; - } - } - // construct the consensus sequence - max_ins = types[n_types - 1]; // max_ins is at least 0 - if (max_ins > 0) { - int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int)); - // count the number of occurrences of each base at each position for each type of insertion - for (t = 0; t < n_types; ++t) { - if (types[t] > 0) { - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - if (p->indel == types[t]) { - uint8_t *seq = bam_get_seq(p->b); - for (k = 1; k <= p->indel; ++k) { - int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; - assert(c<5); - ++inscns_aux[(t*max_ins+(k-1))*5 + c]; - } - } - } - } - } - } - // use the majority rule to construct the consensus - inscns = calloc(n_types * max_ins, 1); - for (t = 0; t < n_types; ++t) { - for (j = 0; j < types[t]; ++j) { - 
int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; - for (k = 0; k < 5; ++k) - if (ia[k] > max) - max = ia[k], max_k = k; - inscns[t*max_ins + j] = max? max_k : 4; - if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's - } - } - free(inscns_aux); - } - // compute the likelihood given each type of indel for each read - max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); - ref2 = calloc(max_ref2, 1); - query = calloc(right - left + max_rd_len + max_ins + 2, 1); - score1 = calloc(N * n_types, sizeof(int)); - score2 = calloc(N * n_types, sizeof(int)); - bca->indelreg = 0; - for (t = 0; t < n_types; ++t) { - int l, ir; - probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; - apf1.bw = apf2.bw = abs(types[t]) + 3; - // compute indelreg - if (types[t] == 0) ir = 0; - else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); - else ir = est_indelreg(pos, ref, -types[t], 0); - if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); - // realignment - for (s = K = 0; s < n; ++s) { - // write ref2 - for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; - for (; j < right && ref[j]; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; - // align each read to ref2 - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, sc, kk; - hts_pos_t tbeg, tend; - uint8_t *seq = bam_get_seq(p->b); - uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads - // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. 
- for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; - if (kk < p->b->core.n_cigar) continue; - // FIXME: the following skips soft clips, but using them may be more sensitive. - // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); - if (types[t] < 0) { - int l = -types[t]; - tbeg = tbeg - l > left? tbeg - l : left; - } - // write the query sequence - for (l = qbeg; l < qend; ++l) - query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; - { // do realignment; this is the bottleneck - const uint8_t *qual = bam_get_qual(p->b), *bq; - uint8_t *qq; - if (qend < qbeg) { - fprintf(stderr, "Impossible data in bcf_call_gap_prep\n"); - exit(1); - } - qq = calloc(qend - qbeg, 1); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; // skip type - for (l = qbeg; l < qend; ++l) { - qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; - if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; - } - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below - if (l > 255) l = 255; - score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; - if (sc > 5) { - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); - l = (int)(100. 
* sc / (qend - qbeg) + .499); - if (l > 255) l = 255; - score2[K*n_types + t] = sc<<8 | l; - } - free(qq); - } -/* - for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); - fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); - fputc('\n', stderr); - fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); -*/ - } - } - } - free(ref2); free(query); - { // compute indelQ - int sc_a[16], sumq_a[16]; - int tmp, *sc = sc_a, *sumq = sumq_a; - if (n_types > 16) { - sc = (int *)malloc(n_types * sizeof(int)); - sumq = (int *)malloc(n_types * sizeof(int)); - } - memset(sumq, 0, n_types * sizeof(int)); - for (s = K = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - /* errmod_cal() assumes that if the call is wrong, the - * likelihoods of other events are equal. This is about - * right for substitutions, but is not desired for - * indels. To reuse errmod_cal(), I have to make - * compromise for multi-allelic indels. - */ - if ((sc[0]&0x3f) == ref_type) { - indelQ1 = (sc[1]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ1 = (sc[t]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); - } - tmp = sc[0]>>6 & 0xff; - indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) 
* indelQ1 + .499); // reduce indelQ - sct = &score2[K*n_types]; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - if ((sc[0]&0x3f) == ref_type) { - indelQ2 = (sc[1]>>14) - (sc[0]>>14); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ2 = (sc[t]>>14) - (sc[0]>>14); - } - tmp = sc[0]>>6 & 0xff; - indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); - // pick the smaller between indelQ1 and indelQ2 - indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - if (indelQ > 255) indelQ = 255; - if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); - } - } - // determine bca->indel_types[] and bca->inscns - bca->maxins = max_ins; - bca->inscns = realloc(bca->inscns, bca->maxins * 4); - for (t = 0; t < n_types; ++t) - sumq[t] = sumq[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) - tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sumq[t]&0x3f) == ref_type) break; - if (t) { // then move the reference type to the first - tmp = sumq[t]; - for (; t > 0; --t) sumq[t] = sumq[t-1]; - sumq[0] = tmp; - } - for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; - for (t = 0; t < 4 && t < n_types; ++t) { - bca->indel_types[t] = types[sumq[t]&0x3f]; - memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); - } - // update p->aux - for (s = n_alt = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int x = 
types[p->aux>>16&0x3f]; - for (j = 0; j < 4; ++j) - if (x == bca->indel_types[j]) break; - p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); - if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); - } - } - - if (sc != sc_a) free(sc); - if (sumq != sumq_a) free(sumq); - } - free(score1); free(score2); - // free - for (i = 0; i < n; ++i) free(ref_sample[i]); - free(ref_sample); - free(types); free(inscns); - return n_alt > 0? 0 : -1; -} diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c deleted file mode 100644 index 6706298..0000000 --- a/samtools/bam2bcf_indel.c.pysam.c +++ /dev/null @@ -1,549 +0,0 @@ -#include "samtools.pysam.h" - -/* bam2bcf_indel.c -- indel caller. - - Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014, 2019 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include -#include -#include "htslib/hts.h" -#include "htslib/sam.h" -#include "bam2bcf.h" -#include "htslib/khash.h" -KHASH_SET_INIT_STR(rg) - -#include "htslib/ksort.h" -KSORT_INIT_GENERIC(uint32_t) - -#define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 50 - -void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) -{ - const char *s, *p, *q, *r, *t; - khash_t(rg) *hash; - if (list == 0 || hdtext == 0) return _hash; - if (_hash == 0) _hash = kh_init(rg); - hash = (khash_t(rg)*)_hash; - if ((s = strstr(hdtext, "@RG\t")) == 0) return hash; - do { - t = strstr(s + 4, "@RG\t"); // the next @RG - if ((p = strstr(s, "\tID:")) != 0) p += 4; - if ((q = strstr(s, "\tPL:")) != 0) q += 4; - if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present - int lp, lq; - char *x; - for (r = p; *r && *r != '\t' && *r != '\n'; ++r) { } - lp = r - p; - for (r = q; *r && *r != '\t' && *r != '\n'; ++r) { } - lq = r - q; - x = calloc((lp > lq? 
lp : lq) + 1, 1); - for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r; - if (strstr(list, x)) { // insert ID to the hash table - khint_t k; - int ret; - for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r; - x[r-p] = 0; - k = kh_get(rg, hash, x); - if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret); - else free(x); - } else free(x); - } - s = t; - } while (s); - return hash; -} - -void bcf_call_del_rghash(void *_hash) -{ - khint_t k; - khash_t(rg) *hash = (khash_t(rg)*)_hash; - if (hash == 0) return; - for (k = kh_begin(hash); k < kh_end(hash); ++k) - if (kh_exist(hash, k)) - free((char*)kh_key(hash, k)); - kh_destroy(rg, hash); -} - -static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos) -{ - int k, y = 0, last_y = 0; - hts_pos_t x = c->pos; - *_tpos = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (c->pos > tpos) return y; - if (x + l > tpos) { - *_tpos = tpos; - return y + (tpos - x); - } - x += l; y += l; - last_y = y; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - if (x + l > tpos) { - *_tpos = is_left? x : x + l; - return y; - } - x += l; - } - } - *_tpos = x; - return last_y; -} -// FIXME: check if the inserted sequence is consistent with the homopolymer run -// l is the relative gap length and l_run is the length of the homopolymer on the reference -static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) -{ - int q, qh; - q = bca->openQ + bca->extQ * (abs(l) - 1); - qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; - return q < qh? 
q : qh; -} - -static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4) -{ - int j, max = 0, score = 0; - hts_pos_t i, max_i = pos; - l = abs(l); - for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { - if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; - else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1; - if (score < 0) break; - if (max < score) max = score, max_i = i; - } - return max_i - pos; -} - -/* - notes: - - n .. number of samples - - the routine sets bam_pileup1_t.aux of each read as follows: - - 6: unused - - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff - */ -int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, - const void *rghash) -{ - int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2; - int N, K, l_run, ref_type, n_alt; - hts_pos_t i, j, left, right; - char *inscns = 0, *ref2, *query, **ref_sample; - khash_t(rg) *hash = (khash_t(rg)*)rghash; - if (ref == 0 || bca == 0) return -1; - // mark filtered reads - if (rghash) { - N = 0; - for (s = N = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - const uint8_t *rg = bam_aux_get(p->b, "RG"); - p->aux = 1; // filtered by default - if (rg) { - khint_t k = kh_get(rg, hash, (const char*)(rg + 1)); - if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered - } - } - } - if (N == 0) return -1; // no reads left - } - // determine if there is a gap - for (s = N = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) - if (plp[s][i].indel != 0) break; - if (i < n_plp[s]) break; - } - if (s == n) return -1; // there is no indel at this position. 
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // find out how many types of indels are present - bca->max_support = bca->max_frac = 0; - int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; - uint32_t *aux; - aux = calloc(N + 1, 4); - m = max_rd_len = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (s = 0; s < n; ++s) { - int na = 0, nt = 0; - for (i = 0; i < n_plp[s]; ++i) { - const bam_pileup1_t *p = plp[s] + i; - if (rghash == 0 || p->aux == 0) { - ++nt; - if (p->indel != 0) { - ++na; - aux[m++] = MINUS_CONST + p->indel; - } - } - j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - double frac = (double)na/nt; - if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) - indel_support_ok = 1; - if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; - n_alt += na; - n_tot += nt; - } - // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), - // check the number of N's in the sequence and skip places where half or more reference bases are Ns. - int nN=0; for (i=pos; i-pos(i-pos) ) { free(aux); return -1; } - - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - // Taking totals makes it hard to call rare indels - if ( !bca->per_sample_flt ) - indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; - if ( n_types == 1 || !indel_support_ok ) { // then skip - free(aux); return -1; - } - if (n_types >= 64) { - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) - fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". 
Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); - t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; - free(aux); - for (t = 0; t < n_types; ++t) - if (types[t] == 0) break; - ref_type = t; // the index of the reference type (0) - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - /* The following block fixes a long-existing flaw in the INDEL - * calling model: the interference of nearby SNPs. However, it also - * reduces the power because sometimes, substitutions caused by - * indels are not distinguishable from true mutations. Multiple - * sequence realignment helps to increase the power. - * - * Masks mismatches present in at least 70% of the reads with 'N'. - */ - { // construct per-sample consensus - int L = right - left + 1, max_i, max2_i; - uint32_t *cns, max, max2; - char *ref0, *r; - ref_sample = calloc(n, sizeof(char*)); - cns = calloc(L, 4); - ref0 = calloc(L, 1); - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; - for (s = 0; s < n; ++s) { - r = ref_sample[s] = calloc(L, 1); - memset(cns, 0, sizeof(int) * L); - // collect ref and non-ref counts - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - hts_pos_t x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) - if (x + j >= left && x + j < right) - cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 
1 : 0x10000; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // determine the consensus - for (i = 0; i < right - left; ++i) r[i] = ref0[i]; - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; - } - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], samtools_stderr); fputc('\n', samtools_stderr); - } - free(ref0); free(cns); - } - { // the length of the homopolymer run around the current position - int c = seq_nt16_table[(int)ref[pos + 1]]; - if (c == 15) l_run = 1; - else { - for (i = pos + 2; ref[i]; ++i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run = i; - for (i = pos; i >= 0; --i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run -= i + 1; - } - } - // construct the consensus sequence - max_ins = types[n_types - 1]; // max_ins is at least 0 - if (max_ins > 0) { - int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int)); - // count the number of occurrences of each base at each position for each type of insertion - for (t = 0; t < n_types; ++t) { - if (types[t] > 0) { - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - if (p->indel == types[t]) { - uint8_t *seq = bam_get_seq(p->b); - for (k = 1; k <= p->indel; ++k) { - int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; - assert(c<5); - ++inscns_aux[(t*max_ins+(k-1))*5 + c]; - } - } - } - } - } - } - // use the majority rule to construct the consensus - inscns = calloc(n_types * max_ins, 1); - for (t = 0; t < n_types; ++t) { - for (j = 0; j < 
types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; - for (k = 0; k < 5; ++k) - if (ia[k] > max) - max = ia[k], max_k = k; - inscns[t*max_ins + j] = max? max_k : 4; - if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's - } - } - free(inscns_aux); - } - // compute the likelihood given each type of indel for each read - max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); - ref2 = calloc(max_ref2, 1); - query = calloc(right - left + max_rd_len + max_ins + 2, 1); - score1 = calloc(N * n_types, sizeof(int)); - score2 = calloc(N * n_types, sizeof(int)); - bca->indelreg = 0; - for (t = 0; t < n_types; ++t) { - int l, ir; - probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; - apf1.bw = apf2.bw = abs(types[t]) + 3; - // compute indelreg - if (types[t] == 0) ir = 0; - else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); - else ir = est_indelreg(pos, ref, -types[t], 0); - if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(samtools_stderr, "%d, %d, %d\n", pos, types[t], ir); - // realignment - for (s = K = 0; s < n; ++s) { - // write ref2 - for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; - for (; j < right && ref[j]; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; - // align each read to ref2 - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, sc, kk; - hts_pos_t tbeg, tend; - uint8_t *seq = bam_get_seq(p->b); - uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads - // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. 
- for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; - if (kk < p->b->core.n_cigar) continue; - // FIXME: the following skips soft clips, but using them may be more sensitive. - // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); - if (types[t] < 0) { - int l = -types[t]; - tbeg = tbeg - l > left? tbeg - l : left; - } - // write the query sequence - for (l = qbeg; l < qend; ++l) - query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; - { // do realignment; this is the bottleneck - const uint8_t *qual = bam_get_qual(p->b), *bq; - uint8_t *qq; - if (qend < qbeg) { - fprintf(samtools_stderr, "Impossible data in bcf_call_gap_prep\n"); - samtools_exit(1); - } - qq = calloc(qend - qbeg, 1); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; // skip type - for (l = qbeg; l < qend; ++l) { - qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; - if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; - } - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below - if (l > 255) l = 255; - score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; - if (sc > 5) { - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); - l = (int)(100. 
* sc / (qend - qbeg) + .499); - if (l > 255) l = 255; - score2[K*n_types + t] = sc<<8 | l; - } - free(qq); - } -/* - for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], samtools_stderr); - fputc('\n', samtools_stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], samtools_stderr); - fputc('\n', samtools_stderr); - fprintf(samtools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); -*/ - } - } - } - free(ref2); free(query); - { // compute indelQ - int sc_a[16], sumq_a[16]; - int tmp, *sc = sc_a, *sumq = sumq_a; - if (n_types > 16) { - sc = (int *)malloc(n_types * sizeof(int)); - sumq = (int *)malloc(n_types * sizeof(int)); - } - memset(sumq, 0, n_types * sizeof(int)); - for (s = K = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - /* errmod_cal() assumes that if the call is wrong, the - * likelihoods of other events are equal. This is about - * right for substitutions, but is not desired for - * indels. To reuse errmod_cal(), I have to make - * compromise for multi-allelic indels. - */ - if ((sc[0]&0x3f) == ref_type) { - indelQ1 = (sc[1]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ1 = (sc[t]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); - } - tmp = sc[0]>>6 & 0xff; - indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) 
* indelQ1 + .499); // reduce indelQ - sct = &score2[K*n_types]; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - if ((sc[0]&0x3f) == ref_type) { - indelQ2 = (sc[1]>>14) - (sc[0]>>14); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ2 = (sc[t]>>14) - (sc[0]>>14); - } - tmp = sc[0]>>6 & 0xff; - indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); - // pick the smaller between indelQ1 and indelQ2 - indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - if (indelQ > 255) indelQ = 255; - if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(samtools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); - } - } - // determine bca->indel_types[] and bca->inscns - bca->maxins = max_ins; - bca->inscns = realloc(bca->inscns, bca->maxins * 4); - for (t = 0; t < n_types; ++t) - sumq[t] = sumq[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) - tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sumq[t]&0x3f) == ref_type) break; - if (t) { // then move the reference type to the first - tmp = sumq[t]; - for (; t > 0; --t) sumq[t] = sumq[t-1]; - sumq[0] = tmp; - } - for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; - for (t = 0; t < 4 && t < n_types; ++t) { - bca->indel_types[t] = types[sumq[t]&0x3f]; - memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); - } - // update p->aux - for (s = n_alt = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int x = 
types[p->aux>>16&0x3f]; - for (j = 0; j < 4; ++j) - if (x == bca->indel_types[j]) break; - p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); - if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(samtools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); - } - } - - if (sc != sc_a) free(sc); - if (sumq != sumq_a) free(sumq); - } - free(score1); free(score2); - // free - for (i = 0; i < n; ++i) free(ref_sample[i]); - free(ref_sample); - free(types); free(inscns); - return n_alt > 0? 0 : -1; -} diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index ac34316..5941f55 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -717,8 +717,10 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -H Print a file header line\n"); fprintf(fp, " -l INT Minimum read length [0]\n"); fprintf(fp, " -o FILE Write output to FILE [stdout]\n"); - fprintf(fp, " -q INT Minimum base quality [0]\n"); - fprintf(fp, " -Q INT Minimum mapping quality [0]\n"); + fprintf(fp, " -q, --min-BQ INT\n" + " Filter bases with base quality smaller than INT [0]\n"); + fprintf(fp, " -Q, --min-MQ INT\n" + " Filter alignments with mapping quality smaller than INT [0]\n"); fprintf(fp, " -H Print a file header\n"); fprintf(fp, " -J Include reads with deletions in depth computation\n"); fprintf(fp, " -s Do not count overlapping reads within a template\n"); @@ -750,6 +752,10 @@ int main_depth(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { + {"min-MQ", required_argument, NULL, 'Q'}, + {"min-mq", required_argument, NULL, 'Q'}, + {"min-BQ", required_argument, NULL, 'q'}, + {"min-bq", required_argument, NULL, 'q'}, SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), {NULL, 0, NULL, 0} }; diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index 7375ef7..edf4281 100644 --- 
a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -719,8 +719,10 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -H Print a file header line\n"); fprintf(fp, " -l INT Minimum read length [0]\n"); fprintf(fp, " -o FILE Write output to FILE [samtools_stdout]\n"); - fprintf(fp, " -q INT Minimum base quality [0]\n"); - fprintf(fp, " -Q INT Minimum mapping quality [0]\n"); + fprintf(fp, " -q, --min-BQ INT\n" + " Filter bases with base quality smaller than INT [0]\n"); + fprintf(fp, " -Q, --min-MQ INT\n" + " Filter alignments with mapping quality smaller than INT [0]\n"); fprintf(fp, " -H Print a file header\n"); fprintf(fp, " -J Include reads with deletions in depth computation\n"); fprintf(fp, " -s Do not count overlapping reads within a template\n"); @@ -752,6 +754,10 @@ int main_depth(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { + {"min-MQ", required_argument, NULL, 'Q'}, + {"min-mq", required_argument, NULL, 'Q'}, + {"min-BQ", required_argument, NULL, 'q'}, + {"min-bq", required_argument, NULL, 'q'}, SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), {NULL, 0, NULL, 0} }; diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c index f3fe2bc..2cf1ac1 100644 --- a/samtools/bam_ampliconclip.c +++ b/samtools/bam_ampliconclip.c @@ -289,7 +289,7 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, uint32_t *new_cigar; uint8_t *new_qual; size_t orig_l_aux = bam_get_l_aux(rec); - uint32_t i, j, odd_base = 0; + uint32_t i, j; uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0; hts_pos_t new_pos = rec->core.pos; uint32_t cig_type, cig_op; @@ -387,14 +387,13 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, if (clipping == soft_clip) { qry_removed = 0; // Copy all the sequence and confidence values - odd_base = 1; // account for an odd number of bases } new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - 
qry_removed + 1) / 2; // Copy remaining SEQ if ((qry_removed & 1) == 0) { memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2), - (rec->core.l_qseq - qry_removed + odd_base) / 2); + (rec->core.l_qseq - qry_removed + 1) / 2); // +1 to account for odd numbers } else { uint8_t *in = orig_seq + qry_removed / 2; uint8_t *out = bam_get_seq(rec_out); diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c index 3b2ed29..1feda1d 100644 --- a/samtools/bam_ampliconclip.c.pysam.c +++ b/samtools/bam_ampliconclip.c.pysam.c @@ -291,7 +291,7 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, uint32_t *new_cigar; uint8_t *new_qual; size_t orig_l_aux = bam_get_l_aux(rec); - uint32_t i, j, odd_base = 0; + uint32_t i, j; uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0; hts_pos_t new_pos = rec->core.pos; uint32_t cig_type, cig_op; @@ -389,14 +389,13 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, if (clipping == soft_clip) { qry_removed = 0; // Copy all the sequence and confidence values - odd_base = 1; // account for an odd number of bases } new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2; // Copy remaining SEQ if ((qry_removed & 1) == 0) { memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2), - (rec->core.l_qseq - qry_removed + odd_base) / 2); + (rec->core.l_qseq - qry_removed + 1) / 2); // +1 to account for odd numbers } else { uint8_t *in = orig_seq + qry_removed / 2; uint8_t *out = bam_get_seq(rec_out); diff --git a/samtools/bam_consensus.c b/samtools/bam_consensus.c new file mode 100644 index 0000000..072dcd3 --- /dev/null +++ b/samtools/bam_consensus.c @@ -0,0 +1,1712 @@ +/* bam_consensus.c -- consensus subcommand. + + Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source) + Copyright (C) 2003-2005,2007-2022 Genome Research Ltd. + + Author: James Bonfield + +The primary work here is GRL since 2021, under an MIT license. 
+Sections derived from Gap5, which include calculate_consensus_gap5() +associated functions, are mostly copyright Genome Research Limited from +2003 onwards. These were originally under a BSD license, but as GRL is +copyright holder these portions can be considered to also be under the +same MIT license below: + + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + * The Gap5 consensus algorithm was in turn derived from the earlier Gap4 + * tool, developed by the Medical Research Council as part of the + * Staden Package. It is unsure how much of this source code is still + * extant, without deep review, but the license used was a compatible + * modified BSD license, included below. + */ + +/* +Modified BSD license for any legacy components from the Staden Package: + +Copyright (c) 2003 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + . 
Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + . Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + . Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +// FIXME: also use strand to spot possible basecalling errors. +// Specifically het calls where mods are predominantly on one +// strand. So maybe require + and - calls and check concordance +// before calling a het as confident. (Still call, but low qual?) + +// TODO: call by kmers rather than individual bases? Or use kmers to skew +// quality at least. It can identify variants that are low quality due to +// neighbouring edits that aren't consistently correlated. + +// TODO: pileup callback ought to know when it's the last in the region / +// chromosome. 
This means the caller code doesn't have to handle the +// termination phase and deduplicates the code. (Changing from +// one chr to the next is the same as ending the last.) +// +// TODO: track which reads contribute to multiple confirmed (HQ) differences +// vs which contribute to only one (LQ) difference. Correlated changes +// are more likely to be real. Ie consensus more of a path than solely +// isolated columns. +// +// Either that or a dummy "end of data" call is made to signify end to +// permit tidying up. Maybe add a "start of data" call too? + +// Eg 50T 20A seems T/A het, +// but 30T+ 20T- 18A+ 2A- seems like a consistent A miscall on one strand +// only, while T is spread evenly across both strands. + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "samtools.h" +#include "sam_opts.h" +#include "bam_plbuf.h" +#include "consensus_pileup.h" + +#ifdef __SSE__ +# include +#else +# define _mm_prefetch(a,b) +#endif + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +// Minimum cutoff for storing mod data; => at least 10% chance +#define MOD_CUTOFF 0.46 + +enum format { + FASTQ, + FASTA, + PILEUP +}; + +typedef unsigned char uc; + +typedef struct { + // User options + char *reg; + int use_qual; + int min_qual; + int adj_qual; + int use_mqual; + double scale_mqual; + int nm_adjust; + int nm_halo; + int sc_cost; + int low_mqual; + int high_mqual; + int min_depth; + double call_fract; + double het_fract; + int gap5; + enum format fmt; + int cons_cutoff; + int ambig; + int line_len; + int default_qual; + int het_only; + int all_bases; + int show_del; + int show_ins; + int excl_flags; + int incl_flags; + int min_mqual; + double P_het; + + // Internal state + samFile *fp; + FILE *fp_out; + sam_hdr_t *h; + hts_idx_t *idx; + hts_itr_t *iter; + kstring_t ks_line; + kstring_t ks_ins_seq; + kstring_t ks_ins_qual; + int last_tid; + hts_pos_t 
last_pos; +} consensus_opts; + +/* -------------------------------------------------------------------------- + * A bayesian consensus algorithm that analyses the data to work out + * which hypothesis of pure A/C/G/T/absent and all combinations of two + * such bases meets the observations. + * + * This has its origins in Gap4 (homozygous) -> Gap5 (heterozygous) + * -> Crumble (tidied up to use htslib's pileup) -> here. + * + */ + +#define CONS_DISCREP 4 +#define CONS_ALL 15 + +#define CONS_MQUAL 16 + +typedef struct { + /* the most likely base call - we never call N here */ + /* A=0, C=1, G=2, T=3, *=4 */ + int call; + + /* The most likely heterozygous base call */ + /* Use "ACGT*"[het / 5] vs "ACGT*"[het % 5] for the combination */ + int het_call; + + /* Log-odds for het_call */ + int het_logodd; + + /* Single phred style call */ + int phred; + + /* Sequence depth */ + int depth; + + /* Discrepancy search score */ + float discrep; +} consensus_t; + +#define P_HET 1e-4 + +#define LOG10 2.30258509299404568401 +#define TENOVERLOG10 4.34294481903251827652 +#define TENLOG2OVERLOG10 3.0103 + +#ifdef __GNUC__ +#define ALIGNED(x) __attribute((aligned(x))) +#else +#define ALIGNED(x) +#endif + +static double prior[25] ALIGNED(16); /* Sum to 1.0 */ +static double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */ + +/* Precomputed matrices for the consensus algorithm */ +static double pMM[101] ALIGNED(16); +static double p__[101] ALIGNED(16); +static double p_M[101] ALIGNED(16); + +static double e_tab_a[1002] ALIGNED(16); +static double *e_tab = &e_tab_a[500]; +static double e_tab2_a[1002] ALIGNED(16); +static double *e_tab2 = &e_tab2_a[500]; +static double e_log[501] ALIGNED(16); + +/* + * Lots of confusing matrix terms here, so some definitions will help. + * + * M = match base + * m = match pad + * _ = mismatch + * o = overcall + * u = undercall + * + * We need to distinguish between homozygous columns and heterozygous columns, + * done using a flat prior. 
This is implemented by treating every observation + * as coming from one of two alleles, giving us a 2D matrix of possibilities + * (the hypotheses) for each and every call (the observation). + * + * So pMM[] is the chance that given a call 'x' that it came from the + * x/x allele combination. Similarly p_o[] is the chance that call + * 'x' came from a mismatch (non-x) / overcall (consensus=*) combination. + * + * Examples with observation (call) C and * follows + * + * C | A C G T * * | A C G T * + * ----------------- ----------------- + * A | __ _M __ __ o_ A | uu uu uu uu um + * C | _M MM _M _M oM C | uu uu uu uu um + * G | __ _M __ __ o_ G | uu uu uu uu um + * T | __ _M __ __ o_ T | uu uu uu uu um + * * | o_ oM o_ o_ oo * | um um um um mm + * + * In calculation terms, the _M is half __ and half MM, similarly o_ and um. + * + * Relative weights of substitution vs overcall vs undercall are governed on a + * per base basis using the P_OVER and P_UNDER scores (subst is + * 1-P_OVER-P_UNDER). + * + * The heterozygosity weight though is a per column calculation as we're + * trying to model whether the column is pure or mixed. Hence this is done + * once via a prior and has no affect on the individual matrix cells. 
+ */ + +static void consensus_init(double p_het) { + int i; + + for (i = -500; i <= 500; i++) + e_tab[i] = exp(i); + for (i = -500; i <= 500; i++) + e_tab2[i] = exp(i/10.); + for (i = 0; i <= 500; i++) + e_log[i] = log(i); + + // Heterozygous locations + for (i = 0; i < 25; i++) + prior[i] = p_het / 20; + prior[0] = prior[6] = prior[12] = prior[18] = prior[24] = (1-p_het)/5; + + lprior15[0] = log(prior[0]); + lprior15[1] = log(prior[1]*2); + lprior15[2] = log(prior[2]*2); + lprior15[3] = log(prior[3]*2); + lprior15[4] = log(prior[4]*2); + lprior15[5] = log(prior[6]); + lprior15[6] = log(prior[7]*2); + lprior15[7] = log(prior[8]*2); + lprior15[8] = log(prior[9]*2); + lprior15[9] = log(prior[12]); + lprior15[10] = log(prior[13]*2); + lprior15[11] = log(prior[14]*2); + lprior15[12] = log(prior[18]); + lprior15[13] = log(prior[19]*2); + lprior15[14] = log(prior[24]); + + + // Rewrite as new form + for (i = 1; i < 101; i++) { + double prob = 1 - pow(10, -i / 10.0); + + // May want to multiply all these by 5 so pMM[i] becomes close + // to -0 for most data. This makes the sums increment very slowly, + // keeping bit precision in the accumulator. + pMM[i] = log(prob/5); + p__[i] = log((1-prob)/20); + p_M[i] = log((exp(pMM[i]) + exp(p__[i]))/2); + } + + pMM[0] = pMM[1]; + p__[0] = p__[1]; + p_M[0] = p_M[1]; +} + +static inline double fast_exp(double y) { + if (y >= -50 && y <= 50) + return e_tab2[(int)(y*10)]; + + if (y < -500) + y = -500; + if (y > 500) + y = 500; + + return e_tab[(int)y]; +} + +/* Taylor (deg 3) implementation of the log */ +static inline double fast_log2(double val) +{ + // FP representation is exponent & mantissa, where + // value = 2^E * M. + // Hence log2(value) = log2(2^E * M) + // = log2(2^E)+ log2(M) + // = E + log2(M) + union { double d; uint64_t x; } u = {val}; + const int E = ((u.x >> 52) & 2047) - 1024; // exponent E + // Initial log2(M) based on mantissa + u.x &= ~(2047LL << 52); + u.x += 1023LL << 52; + + val = ((-1/3.) 
* u.d + 2) * u.d - 2/3.; + + return E + val; +} + +#define ph_log(x) (-TENLOG2OVERLOG10*fast_log2((x))) + + +int nins(const bam1_t *b){ + int i, indel = 0; + uint32_t *cig = bam_get_cigar(b); + for (i = 0; i < b->core.n_cigar; i++) { + int op = bam_cigar_op(cig[i]); + if (op == BAM_CINS || op == BAM_CDEL) + indel += bam_cigar_oplen(cig[i]); + } + return indel; +} + +// Return the local NM figure within halo (+/- HALO) of pos. +// This local NM is used as a way to modify MAPQ to get a localised MAPQ +// score via an adhoc fashion. +double nm_local(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { + int *nm = (int *)p->cd; + if (!nm) + return 0; + pos -= b->core.pos; + if (pos < 0) + return nm[0]; + if (pos >= b->core.l_qseq) + return nm[b->core.l_qseq-1]; + + return nm[pos] / 10.0; +} + +/* + * Initialise a new sequence appearing in the pileup. We use this to + * precompute some metrics that we'll repeatedly use in the consensus + * caller; the localised NM score. + * + * We also directly amend the BAM record (which will be discarded later + * anyway) to modify qualities to account for local quality minima. + * + * Returns 0 (discard) or 1 (keep) on success, -1 on failure. + */ +int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { + consensus_opts *opts = (consensus_opts *)client_data; + if (!opts->use_mqual) + return 1; + + const bam1_t *b = &p->b; + int qlen = b->core.l_qseq, i; + int *local_nm = calloc(qlen, sizeof(*local_nm)); + if (!local_nm) + return -1; + p->cd = local_nm; + + if (opts->adj_qual) { +#if 0 + // Tweak by localised quality. + // Quality is reduced by a significant portion of the minimum quality + // in neighbouring bases, on the pretext that if the region is bad, then + // this base is bad even if it claims otherwise. + uint8_t *qual = bam_get_qual(b); + const int qhalo = 8; // 2? 
+ int qmin = 50; // effectively caps PacBio qual too + for (i = 0; i < qlen && i < qhalo; i++) { + local_nm[i] = qual[i]; + if (qmin > qual[i]) + qmin = qual[i]; + } + for (;i < qlen-qhalo; i++) { + //int t = (qual[i]*1 + 3*qmin)/4; // good on 60x + int t = (qual[i] + 5*qmin)/4; // good on 15x + local_nm[i] = t < qual[i] ? t : qual[i]; + if (qmin > qual[i+qhalo]) + qmin = qual[i+qhalo]; + else if (qmin <= qual[i-qhalo]) { + int j; + qmin = 50; + for (j = i-qhalo+1; j <= i+qhalo; j++) + if (qmin > qual[j]) + qmin = qual[j]; + } + } + for (; i < qlen; i++) { + local_nm[i] = qual[i]; + local_nm[i] = (local_nm[i] + 6*qmin)/4; + } + + for (i = 0; i < qlen; i++) { + qual[i] = local_nm[i]; + + // Plus overall rescale. + // Lower becomes lower, very high becomes a little higher. + // Helps deep GIAB, but detrimental elsewhere. (What this really + // indicates is quality calibration differs per data set.) + // It's probably something best accounted for somewhere else. + + //qual[i] = qual[i]*qual[i]/40+1; + } + memset(local_nm, 0, qlen * sizeof(*local_nm)); +#else + // Skew local NM by qual vs min-qual delta + uint8_t *qual = bam_get_qual(b); + const int qhalo = 8; // 4 + int qmin = 99; + for (i = 0; i < qlen && i < qhalo; i++) { + if (qmin > qual[i]) + qmin = qual[i]; + } + for (;i < qlen-qhalo; i++) { + int t = (qual[i] + 5*qmin)/4; // good on 15x + local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + if (qmin > qual[i+qhalo]) + qmin = qual[i+qhalo]; + else if (qmin <= qual[i-qhalo]) { + int j; + qmin = 99; + for (j = i-qhalo+1; j <= i+qhalo; j++) + if (qmin > qual[j]) + qmin = qual[j]; + } + } + for (; i < qlen; i++) { + int t = (qual[i] + 5*qmin)/4; // good on 15x + local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + } +#endif + } + + // Adjust local_nm array by the number of edits within + // a defined region (pos +/- halo). 
+ const int halo = opts->nm_halo; + const uint8_t *md = bam_aux_get(b, "MD"); + if (!md) + return 1; + md = (const uint8_t *)bam_aux2Z(md); + + // Handle cost of being near a soft-clip + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar; + + if ( (cig[0] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || + ((cig[0] & BAM_CIGAR_MASK) == BAM_CHARD_CLIP && ncig > 1 && + (cig[1] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP)) { + for (i = 0; i < halo && i < qlen; i++) + local_nm[i]+=opts->sc_cost; + for (; i < halo*2 && i < qlen; i++) + local_nm[i]+=opts->sc_cost>>1; + } + if ( (cig[ncig-1] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || + ((cig[ncig-1] & BAM_CIGAR_MASK) == BAM_CHARD_CLIP && ncig > 1 && + (cig[ncig-2] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP)) { + for (i = qlen-1; i >= qlen-halo && i >= 0; i--) + local_nm[i]+=opts->sc_cost; + for (; i >= qlen-halo*2 && i >= 0; i--) + local_nm[i]+=opts->sc_cost>>1; + } + + // Now iterate over MD tag + int pos = 0; + while (*md) { + if (isdigit(*md)) { + uint8_t *endptr; + long i = strtol((char *)md, (char **)&endptr, 10); + md = endptr; + pos += i; + continue; + } + + // deletion. + // Should we bump local_nm here too? Maybe + if (*md == '^') { + while (*++md && !isdigit(*md)) + continue; + continue; + } + + // substitution + for (i = pos-halo*2 >= 0 ? pos-halo*2 : 0; i < pos-halo; i++) + local_nm[i]+=5; + for (; i < pos+halo && i < qlen; i++) + local_nm[i]+=10; + for (; i < pos+halo*2 && i < qlen; i++) + local_nm[i]+=5; + md++; + } + + return 1; +} + + +static +int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, + pileup_t *plp, consensus_opts *opts, + consensus_t *cons, int default_qual) { + int i, j; + static int init_done =0; + static double q2p[101], mqual_pow[256]; + double min_e_exp = DBL_MIN_EXP * log(2) + 1; + + double S[15] ALIGNED(16) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + double sumsC[6] = {0,0,0,0,0,0}; // A C G T * N + + // Small hash on seq to check for uniqueness of surrounding bases. 
+ // If it's frequent, then it's more likely to be correctly called than + // if it's rare. + // Helps a bit on deep data, especially with K2=3, but detrimental on + // shallow and (currently) quite a slow down. + +//#define K2 2 +#ifdef K2 + int hashN[1<<(K2*4+2)] = {0}; + int hash1[1<<2] = {0}; +#endif + + /* Map the 15 possible combinations to 1-base or 2-base encodings */ + static int map_sing[15] ALIGNED(16) = + {0, 5, 5, 5, 5, + 1, 5, 5, 5, + 2, 5, 5, + 3, 5, + 4}; + static int map_het[15] ALIGNED(16) = + {0, 1, 2, 3, 4, + 6, 7, 8, 9, + 12, 13, 14, + 18, 19, + 24}; + + if (!init_done) { + init_done = 1; + consensus_init(opts->P_het); + + for (i = 0; i <= 100; i++) { + q2p[i] = pow(10, -i/10.0); + } + + for (i = 0; i < 255; i++) { + //mqual_pow[i] = 1-pow(10, -(i+.01)/10.0); + mqual_pow[i] = 1-pow(10, -(i*.9)/10.0); + //mqual_pow[i] = 1-pow(10, -(i/3+.1)/10.0); + //mqual_pow[i] = 1-pow(10, -(i/2+.05)/10.0); + } + // unknown mqual + mqual_pow[255] = mqual_pow[10]; + } + + /* Initialise */ + int counts[6] = {0}; + + /* Accumulate */ + +#ifdef K2 + const pileup_t *ptmp = plp; + for (; ptmp; ptmp = ptmp->next) { + const pileup_t *p = ptmp; + if (p->qual < opts->min_qual) + continue; + + int hb = 0; +#define _ 0 + static int X[16] = {_,0,1,_,2,_,_,_,3,_,_,_,_,_,_,_}; +#undef _ + uint8_t *seq = bam_get_seq(&p->b); + int i, base1 = X[p->base4]; + hash1[base1]++; + for (i = p->seq_offset-K2; i <= p->seq_offset+K2; i++) { + int base = i >= 0 && i < p->b.core.l_qseq ? 
X[bam_seqi(seq,i)] : _; + hb = (hb<<2)|base; + } + hashN[hb]++; + } +#endif + + int td = depth; // original depth + depth = 0; + for (; plp; plp = plp->next) { + pileup_t *p = plp; + + if (p->next) + _mm_prefetch(p->next, _MM_HINT_T0); + + if (p->qual < opts->min_qual) + continue; + + if (p->ref_skip) + continue; + +#ifdef K2 + int hb = 0; +#define _ 0 + static int X[16] = {_,0,1,_,2,_,_,_,3,_,_,_,_,_,_,_}; + int i, base1 = X[p->base4]; + for (i = p->seq_offset-K2; i <= p->seq_offset+K2; i++) { + int base = i >= 0 && i < p->b.core.l_qseq ? X[bam_seqi(seq,i)] : _; + hb = (hb<<2)|base; + } + // fprintf(stderr, "%c: %d %d of %d\t%d %d\n", p->base, hashN[hb], hash1[base1], td, p->qual, p->qual * hashN[hb] / hash1[base1]); +#undef _ +#endif + + const bam1_t *b = &p->b; + uint8_t base = p->base4; + uint8_t *qual_arr = bam_get_qual(b); + uint8_t qual = p->qual; + //qual = qual*qual/40+1; + if (qual == 255 || (qual == 0 && *qual_arr == 255)) + qual = default_qual; + +#ifdef K2 + //qual = qual * hashN[hb] / hash1[base1]; + qual -= -TENOVERLOG10*log(hashN[hb] / (hash1[base1]+.1)); + if (qual < 1) + qual = 1; +#endif + + // =ACM GRSV TWYH KDBN * + static int L[32] = { + 5,0,1,5, 2,5,5,5, 3,5,5,5, 5,5,5,5, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + }; + + // convert from sam base to acgt*n order. + base = L[base]; + + double MM, __, _M, qe; + + // Correction for mapping quality. Maybe speed up via lookups? + // Cannot nullify mapping quality completely. Lots of (true) + // SNPs means low mapping quality. (Ideally need to know + // hamming distance to next best location.) + + if (flags & CONS_MQUAL) { + int mqual = b->core.qual; + if (opts->nm_adjust) { + mqual /= (nm_local(p, b, pos)+1); + mqual *= 1 + 2*(0.5-(td>30?30:td)/60.0); // depth fudge + } + + // higher => call more; +FP, -FN + // lower => call less; -FP, +FN + mqual *= opts->scale_mqual; + + // Drop these? They don't seem to ever help. 
+ if (mqual < opts->low_mqual) + mqual = opts->low_mqual; + if (mqual > opts->high_mqual) + mqual = opts->high_mqual; + + double _p = 1-q2p[qual]; + double _m = mqual_pow[mqual]; + qual = ph_log(1-(_m * _p + (1 - _m)/4)); // CURRENT + //qual = ph_log(1-_p*_m); // testing + //qual *= 6/sqrt(td); + } + + /* Quality 0 should never be permitted as it breaks the maths */ + if (qual < 1) + qual = 1; + + __ = p__[qual]; // neither match + MM = pMM[qual] - __; // both match + _M = p_M[qual] - __; // one allele only (half match) + + if (flags & CONS_DISCREP) { + qe = q2p[qual]; + sumsC[base] += 1 - qe; + } + + counts[base]++; + + switch (base) { + case 0: // A + S[0] += MM; + S[1] += _M; + S[2] += _M; + S[3] += _M; + S[4] += _M; + break; + + case 1: // C + S[1] += _M; + S[5] += MM; + S[6] += _M; + S[7] += _M; + S[8] += _M; + break; + + case 2: // G + S[ 2] += _M; + S[ 6] += _M; + S[ 9] += MM; + S[10] += _M; + S[11] += _M; + break; + + case 3: // T + S[ 3] += _M; + S[ 7] += _M; + S[10] += _M; + S[12] += MM; + S[13] += _M; + + break; + + case 4: // * + S[ 4] += _M; + S[ 8] += _M; + S[11] += _M; + S[13] += _M; + S[14] += MM; + break; + + case 5: /* N => equal weight to all A,C,G,T but not a pad */ + S[ 0] += MM; + S[ 1] += MM; + S[ 2] += MM; + S[ 3] += MM; + S[ 4] += _M; + + S[ 5] += MM; + S[ 6] += MM; + S[ 7] += MM; + S[ 8] += _M; + + S[ 9] += MM; + S[10] += MM; + S[11] += _M; + + S[12] += MM; + S[13] += _M; + break; + } + + depth++; + + if (p->eof && p->cd) { + free(p->cd); + p->cd = NULL; + } + } + + /* We've accumulated stats, so now we speculate on the consensus call */ + double shift, max, max_het, norm[15]; + int call = 0, het_call = 0; + double tot1 = 0, tot2 = 0; + + /* + * Scale numbers so the maximum score is 0. This shift is essentially + * a multiplication in non-log scale to both numerator and denominator, + * so it cancels out. We do this to avoid calling exp(-large_num) and + * ending up with norm == 0 and hence a 0/0 error. 
+ * + * Can also generate the base-call here too. + */ + shift = -DBL_MAX; + max = -DBL_MAX; + max_het = -DBL_MAX; + + for (j = 0; j < 15; j++) { + S[j] += lprior15[j]; + if (shift < S[j]) + shift = S[j]; + + /* Only call pure AA, CC, GG, TT, ** for now */ + if (j != 0 && j != 5 && j != 9 && j != 12 && j != 14) { + if (max_het < S[j]) { + max_het = S[j]; + het_call = j; + } + continue; + } + + if (max < S[j]) { + max = S[j]; + call = j; + } + } + + /* + * Shift and normalise. + * If call is, say, b we want p = b/(a+b+c+...+n), but then we do + * p/(1-p) later on and this has exceptions when p is very close + * to 1. + * + * Hence we compute b/(a+b+c+...+n - b) and + * rearrange (p/norm) / (1 - (p/norm)) to be p/norm2. + */ + for (j = 0; j < 15; j++) { + S[j] -= shift; + double e = fast_exp(S[j]); + S[j] = (S[j] > min_e_exp) ? e : DBL_MIN; + norm[j] = 0; + } + + for (j = 0; j < 15; j++) { + norm[j] += tot1; + norm[14-j] += tot2; + tot1 += S[j]; + tot2 += S[14-j]; + } + + /* And store result */ + if (!depth || depth == counts[5] /* all N */) { + cons->call = 4; /* N */ + cons->het_call = 0; + cons->het_logodd = 0; + cons->phred = 0; + cons->depth = 0; + cons->discrep = 0; + return 0; + } + + cons->depth = depth; + + /* Call */ + if (norm[call] == 0) norm[call] = DBL_MIN; + // Approximation of phred for when S[call] ~= 1 and norm[call] + // is small. Otherwise we need the full calculation. + int ph; + if (S[call] == 1 && norm[call] < .01) + ph = ph_log(norm[call]) + .5; + else + ph = ph_log(1-S[call]/(norm[call]+S[call])) + .5; + + cons->call = map_sing[call]; + cons->phred = ph < 0 ? 
0 : ph; + + if (norm[het_call] == 0) norm[het_call] = DBL_MIN; + ph = TENLOG2OVERLOG10 * (fast_log2(S[het_call]) + - fast_log2(norm[het_call])) + .5; + + cons->het_call = map_het[het_call]; + cons->het_logodd = ph; + + /* Compute discrepancy score */ + if (flags & CONS_DISCREP) { + double m = sumsC[0]+sumsC[1]+sumsC[2]+sumsC[3]+sumsC[4]; + double c; + if (cons->het_logodd > 0) + c = sumsC[cons->het_call%5] + sumsC[cons->het_call/5]; + else + c = sumsC[cons->call]; + cons->discrep = (m-c)/sqrt(m); + } + + return 0; +} + + +/* -------------------------------------------------------------------------- + * Main processing logic + */ + +static void dump_fastq(consensus_opts *opts, + const char *name, + const char *seq, size_t seq_l, + const char *qual, size_t qual_l) { + enum format fmt = opts->fmt; + int line_len = opts->line_len; + FILE *fp = opts->fp_out; + + fprintf(fp, "%c%s\n", ">@"[fmt==FASTQ], name); + size_t i; + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), seq+i); + + if (fmt == FASTQ) { + fprintf(fp, "+\n"); + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), qual+i); + } +} + +//--------------------------------------------------------------------------- + +/* + * Reads a single alignment record, using either the iterator + * or a direct sam_read1 call. + */ +static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { + consensus_opts *opts = (consensus_opts *)dat; + + for (;;) { + int ret = opts->iter + ? 
sam_itr_next(fp, opts->iter, b) + : sam_read1(fp, h, b); + if (ret < 0) + return ret; + + // Apply hard filters + if (opts->incl_flags && !(b->core.flag & opts->incl_flags)) + continue; + if (opts->excl_flags && (b->core.flag & opts->excl_flags)) + continue; + if (b->core.qual < opts->min_mqual) + continue; + + return ret; + } +} + +/* -------------------------------------------------------------------------- + * A simple summing algorithm, either pure base frequency, or by + * weighting them according to their quality values. + * + * This is crude, but easy to understand and fits with several + * standard pileup criteria (eg COG-UK / CLIMB Covid-19 seq project). + * + * + * call1 / score1 / depth1 is the highest scoring allele. + * call2 / score2 / depth2 is the second highest scoring allele. + * + * Het_fract: score2/score1 + * Call_fract: score1 or score1+score2 over total score + * Min_depth: minimum total depth of utilised bases (depth1+depth2) + * Min_score: minimum total score of utilised bases (score1+score2) + * + * Eg het_fract 0.66, call_fract 0.75 and min_depth 10. + * 11A, 2C, 2G (14 total depth) is A. + * 9A, 2C, 2G (12 total depth) is N as depth(A) < 10. + * 11A, 5C, 5G (21 total depth) is N as 11/21 < 0.75 (call_fract) + * + * + * 6A, 5G, 1C (12 total depth) is AG het as depth(A)+depth(G) >= 10 + * and 5/6 >= 0.66 and 11/12 >= 0.75. + * + * 6A, 5G, 4C (15 total depth) is N as (6+5)/15 < 0.75 (call_fract). + * + * + * Note for the purpose of deletions, a base/del has an ambiguity + * code of lower-case base (otherwise it is uppercase). + */ +static int calculate_consensus_simple(const pileup_t *plp, + consensus_opts *opts, int *qual) { + int i, min_qual = opts->min_qual; + + // Map "seqi" nt16 to A,C,G,T compatibility with weights on pure bases. 
+ // where seqi is A | (C<<1) | (G<<2) | (T<<3) + // * A C M G R S V T W Y H K D B N + static int seqi2A[16] = { 0,8,0,4, 0,4,0,2, 0,4,0,2, 0,2,0,1 }; + static int seqi2C[16] = { 0,0,8,4, 0,0,4,2, 0,0,4,2, 0,0,2,1 }; + static int seqi2G[16] = { 0,0,0,0, 8,4,4,1, 0,0,0,0, 4,2,2,1 }; + static int seqi2T[16] = { 0,0,0,0, 0,0,0,0, 8,4,4,2, 8,2,2,1 }; + + // Ignore ambiguous bases in seq for now, so we don't treat R, Y, etc + // as part of one base and part another. Based on BAM seqi values. + // We also use freq[16] as "*" for gap. + int freq[17] = {0}; // base frequency, aka depth + int score[17] = {0}; // summation of base qualities + + // Accumulate + for (; plp; plp = plp->next) { + const pileup_t *p = plp; + if (p->next) + _mm_prefetch(p->next, _MM_HINT_T0); + + int q = p->qual; + if (q < min_qual) + // Should we still record these in freq[] somewhere so + // we can use them in the fracts? + // Difference between >= X% of high-qual bases calling Y + // and >= X% of all bases are high-quality Y calls. + continue; + + //int b = p->is_del ? 16 : bam_seqi(bam_get_seq(&p->b), p->seq_offset); + int b = p->base4; + + // Map ambiguity codes to one or more component bases. + if (b < 16) { + int Q = seqi2A[b] * (opts->use_qual ? q : 1); + freq[1] += Q?1:0; + score[1] += Q?Q:0; + Q = seqi2C[b] * (opts->use_qual ? q : 1); + freq[2] += Q?1:0; + score[2] += Q?Q:0; + Q = seqi2G[b] * (opts->use_qual ? q : 1); + freq[4] += Q?1:0; + score[4] += Q?Q:0; + Q = seqi2T[b] * (opts->use_qual ? q : 1); + freq[8] += Q?1:0; + score[8] += Q?Q:0; + } else { /* * */ + freq[16] ++; + score[16]+=8 * (opts->use_qual ? 
q : 1); + } + } + + // Total usable depth + int tscore = 0; + for (i = 0; i < 5; i++) + tscore += score[1<= opts->het_fract * score1 && opts->ambig) { + used_base |= call2; + used_score += score2; + used_depth += depth2; + } + + // N is too shallow, or insufficient proportion of total + if (used_depth < opts->min_depth || + used_score < opts->call_fract * tscore) { + used_depth = 0; + // But note shallow gaps are still called gaps, not N, as + // we're still more confident there is no base than it is + // A, C, G or T. + used_base = call1 == 16 /*&& depth1 >= call_fract * depth*/ + ? 16 : 0; // * or N + } + + // Our final call. "?" shouldn't be possible to generate + const char *het = + "NACMGRSVTWYHKDBN" + "*ac?g???t???????"; + + //printf("%c %d\n", het[used_base], used_depth); + if (qual) + *qual = used_base ? 100.0 * used_score / tscore : 0; + + return het[used_base]; +} + +static int empty_pileup2(consensus_opts *opts, sam_hdr_t *h, int tid, + hts_pos_t start, hts_pos_t end) { + const char *name = sam_hdr_tid2name(h, tid); + hts_pos_t i; + + int err = 0; + for (i = start; i < end; i++) + err |= fprintf(opts->fp_out, "%s\t%"PRIhts_pos"\t0\t0\tN\t0\t*\t*\n", name, i+1) < 0; + + return err ? 
-1 : 0; +} + +/* + * Returns 0 on success + * -1 on failure + */ +static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, + int depth, hts_pos_t pos, int nth, int is_insert) { + unsigned char *qp, *cp; + char *rp; + int ref, cb, cq; + consensus_opts *opts = (consensus_opts *)cd; + int tid = p->b.core.tid; + +// opts->show_ins=0; +// opts->show_del=1; + if (!opts->show_ins && nth) + return 0; + + if (opts->iter) { + if (opts->iter->beg >= pos || opts->iter->end < pos) + return 0; + } + + if (opts->all_bases) { + if (tid != opts->last_tid && opts->last_tid >= 0) { + hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); + if (opts->iter) + len = MIN(opts->iter->end, len); + if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos, + len) < 0) + return -1; + if (tid >= 0) { + if (empty_pileup2(opts, opts->h, tid, + opts->iter ? opts->iter->beg : 0, + pos-1) < 0) + return -1; + } + } + if (opts->last_pos >= 0 && pos > opts->last_pos+1) { + if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos, + pos-1) < 0) + return -1; + } else if (opts->last_pos < 0) { + if (empty_pileup2(opts, opts->h, p->b.core.tid, + opts->iter ? opts->iter->beg : 0, pos-1) < 0) + return -1; + } + } + + if (opts->gap5) { + consensus_t cons; + calculate_consensus_gap5(pos, opts->use_mqual ? 
CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual); + if (cons.het_logodd > 0 && opts->ambig) { + cb = "AMRWa" // 5x5 matrix with ACGT* per row / col + "MCSYc" + "RSGKg" + "WYKTt" + "acgt*"[cons.het_call]; + cq = cons.het_logodd; + } else{ + cb = "ACGT*"[cons.call]; + cq = cons.phred; + } + if (cq < opts->cons_cutoff && cb != '*') { + cb = 'N'; + cq = 0; + } + } else { + cb = calculate_consensus_simple(p, opts, &cq); + } + if (cb < 0) + return -1; + + if (!p) + return 0; + + if (!opts->show_del && cb == '*') + return 0; + + /* Ref, pos, nth, score, seq, qual */ + kstring_t *ks = &opts->ks_line; + ks->l = 0; + ref = p->b.core.tid; + rp = (char *)sam_hdr_tid2name(h, ref); + + int err = 0; + err |= kputs(rp, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(pos, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(nth, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(depth, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputc_(cb, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(cq, ks) < 0; + err |= kputc_('\t', ks) < 0; + if (err) + return -1; + + /* Seq + qual at predetermined offsets */ + if (ks_resize(ks, ks->l + depth*2 + 2) < 0) + return -1; + + cp = (unsigned char *)ks->s + ks->l; + ks->l += depth*2 + 2; + qp = cp+depth+1; + for (; p; p = p->next) { + // Too tight a loop to help much, but some benefit still + if (p->next && p->next->next) + _mm_prefetch(p->next->next, _MM_HINT_T0); + if (p->b_is_rev) { + *cp++ = p->base == '*' ? 
'#' : tolower(p->base); + } else { + *cp++ = p->base; + } + *qp++ = MIN(p->qual,93) + '!'; + } + *cp++ = '\t'; + *qp++ = '\n'; + if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + return -1; + + opts->last_pos = pos; + opts->last_tid = tid; + + return 0; +} + +static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, + int depth, hts_pos_t pos, int nth, int is_insert) { + int cb, cq; + consensus_opts *opts = (consensus_opts *)cd; + int tid = p->b.core.tid; + kstring_t *seq = &opts->ks_ins_seq; + kstring_t *qual = &opts->ks_ins_qual; + + if (!opts->show_ins && nth) + return 0; + + if (opts->iter) { + if (opts->iter->beg >= pos || opts->iter->end < pos) + return 0; + } + + if (tid != opts->last_tid) { + if (opts->last_tid != -1) { + if (opts->all_bases) { + int i, N; + if (opts->iter) { + opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1); + N = opts->iter->end; + } else { + N = INT_MAX; + } + N = MIN(N, sam_hdr_tid2len(opts->h, opts->last_tid)) + - opts->last_pos; + if (N > 0) { + if (ks_expand(seq, N+1) < 0) + return -1; + if (ks_expand(qual, N+1) < 0) + return -1; + for (i = 0; i < N; i++) { + seq->s[seq->l++] = 'N'; + qual->s[qual->l++] = '!'; + } + seq->s[seq->l] = 0; + qual->s[qual->l] = 0; + } + } + dump_fastq(opts, sam_hdr_tid2name(opts->h, opts->last_tid), + seq->s, seq->l, qual->s, qual->l); + } + + seq->l = 0; qual->l = 0; + opts->last_tid = tid; +// if (opts->all_bases) +// opts->last_pos = 0; + if (opts->iter) + opts->last_pos = opts->iter->beg; + else + opts->last_pos = opts->all_bases ? 0 : pos-1; + } + + // share this with basic_pileup + if (opts->gap5) { + consensus_t cons; + calculate_consensus_gap5(pos, opts->use_mqual ? 
CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual); + if (cons.het_logodd > 0 && opts->ambig) { + cb = "AMRWa" // 5x5 matrix with ACGT* per row / col + "MCSYc" + "RSGKg" + "WYKTt" + "acgt*"[cons.het_call]; + cq = cons.het_logodd; + } else{ + cb = "ACGT*"[cons.call]; + cq = cons.phred; + } + if (cq < opts->cons_cutoff && cb != '*' && + cons.het_call % 5 != 4 && cons.het_call / 5 != 4) { + // het base/* keeps base or * as most likely pure call, else N. + // This is because we don't have a traditional way of representing + // base or not-base ambiguity. + cb = 'N'; + cq = 0; + } + } else { + cb = calculate_consensus_simple(p, opts, &cq); + } + if (cb < 0) + return -1; + + if (!p) + return 0; + + if (!opts->show_del && cb == '*') { + opts->last_pos = pos; + opts->last_tid = tid; + return 0; + } + // end of share + + // Append consensus base/qual to seqs + if (pos > opts->last_pos) { + if (opts->last_pos >= 0 || opts->all_bases) { + // FIXME: don't expand qual if fasta + if (ks_expand(seq, pos - opts->last_pos) < 0 || + ks_expand(qual, pos - opts->last_pos) < 0) + return -1; + memset(seq->s + seq->l, 'N', pos - (opts->last_pos+1)); + memset(qual->s + qual->l, '!', pos - (opts->last_pos+1)); + seq->l += pos - (opts->last_pos+1); + qual->l += pos - (opts->last_pos+1); + } + } + if ((nth && opts->show_ins && cb != '*') + || cb != '*' || (pos > opts->last_pos && opts->show_del)) { + int err = 0; + err |= kputc(cb, seq) < 0; + err |= kputc(MIN(cq, '~'-'!')+'!', qual) < 0; + if (err) + return -1; + } + + opts->last_pos = pos; + opts->last_tid = tid; + + return 0; +} +// END OF NEW PILEUP +//--------------------------------------------------------------------------- + +static void usage_exit(FILE *fp, int exit_status) { + fprintf(fp, "Usage: samtools consensus [options] \n"); + fprintf(fp, "\nOptions:\n"); + fprintf(fp, " -r, --region REG Limit query to REG. 
Requires an index\n"); + fprintf(fp, " -f, --format FMT Output in format FASTA, FASTQ or PILEUP [FASTA]\n"); + fprintf(fp, " -l, --line-len INT Wrap FASTA/Q at line length INT [70]\n"); + fprintf(fp, " -o, --output FILE Output consensus to FILE\n"); + fprintf(fp, " -m, --mode STR Switch consensus mode to \"simple\"/\"bayesian\" [bayesian]\n"); + fprintf(fp, " -a Output all bases (start/end of reference)\n"); + fprintf(fp, " --rf, --incl-flags STR|INT\n"); + fprintf(fp, " Only include reads with any flag bit set [0]\n"); + fprintf(fp, " --ff, --excl-flags STR|INT\n"); + fprintf(fp, " Exclude reads with any flag bit set\n"); + fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); + fprintf(fp, " --min-MQ INT Exclude reads with mapping quality below INT [0]\n"); + fprintf(fp, " --show-del yes/no Whether to show deletion as \"*\" [no]\n"); + fprintf(fp, " --show-ins yes/no Whether to show insertions [yes]\n"); + fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n"); + fprintf(fp, "\nFor simple consensus mode:\n"); + fprintf(fp, " -q, --(no-)use-qual Use quality values in calculation [off]\n"); + fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n"); + fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n"); + fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.5]\n"); + fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n"); + fprintf(fp, " -C, --cutoff C Consensus cutoff quality C [10]\n"); + fprintf(fp, " --(no-)adj-qual Modify quality with local minima [on]\n"); + fprintf(fp, " --(no-)use-MQ Use mapping quality in calculation [on]\n"); + fprintf(fp, " --(no-)adj-MQ Modify mapping quality by local NM [on]\n"); + fprintf(fp, " --NM-halo INT Size of window for NM count in --adj-MQ [50]\n"); + fprintf(fp, " --scale-MQ FLOAT Scale mapping quality by FLOAT [1.00]\n"); + fprintf(fp, " --low-MQ INT Cap minimum mapping quality [1]\n"); + fprintf(fp, " --high-MQ INT Cap maximum mapping quality 
[60]\n"); + fprintf(fp, " --P-het FLOAT Probability of heterozygous site[%.1e]\n", + P_HET); + + fprintf(fp, "\nGlobal options:\n"); + sam_global_opt_help(fp, "-.---@-."); + exit(exit_status); +} + +int main_consensus(int argc, char **argv) { + int c, ret = 1; + + consensus_opts opts = { + // User options + .gap5 = 1, + .use_qual = 0, + .min_qual = 0, + .adj_qual = 1, + .use_mqual = 1, + .scale_mqual = 1.00, + .nm_adjust = 1, + .nm_halo = 50, + .sc_cost = 60, + .low_mqual = 1, + .high_mqual = 60, + .min_depth = 1, + .call_fract = 0.75, + .het_fract = 0.5, + .het_only = 0, + .fmt = FASTA, + .cons_cutoff = 10, + .ambig = 0, + .line_len = 70, + .default_qual = 10, + .all_bases = 0, + .show_del = 0, + .show_ins = 1, + .incl_flags = 0, + .excl_flags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, + .min_mqual = 0, + .P_het = P_HET, + + // Internal state + .ks_line = {0,0}, + .ks_ins_seq = {0,0}, + .ks_ins_qual = {0,0}, + .fp = NULL, + .fp_out = stdout, + .iter = NULL, + .idx = NULL, + .last_tid = -1, + .last_pos = -1, + }; + + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), + {"use-qual", no_argument, NULL, 'q'}, + {"no-use-qual", no_argument, NULL, 'q'+1000}, + {"adj-qual", no_argument, NULL, 'q'+100}, + {"no-adj-qual", no_argument, NULL, 'q'+101}, + {"use-MQ", no_argument, NULL, 'm'+1000}, + {"no-use-MQ", no_argument, NULL, 'm'+1001}, + {"adj-MQ", no_argument, NULL, 'm'+100}, + {"no-adj-MQ", no_argument, NULL, 'm'+101}, + {"NM-halo", required_argument, NULL, 'h'+100}, + {"SC-cost", required_argument, NULL, 'h'+101}, + {"scale-MQ", required_argument, NULL, 14}, + {"low-MQ" , required_argument, NULL, 9}, + {"high-MQ", required_argument, NULL, 10}, + {"min-depth", required_argument, NULL, 'd'}, + {"call-fract", required_argument, NULL, 'c'}, + {"het-fract", required_argument, NULL, 'H'}, + {"region", required_argument, NULL, 'r'}, + {"format", required_argument, NULL, 'f'}, 
+ {"cutoff", required_argument, NULL, 'C'}, + {"ambig", no_argument, NULL, 'A'}, + {"line-len", required_argument, NULL, 'l'}, + {"default-qual", required_argument, NULL, 1}, + {"het-only", no_argument, NULL, 6}, + {"show-del", required_argument, NULL, 7}, + {"show-ins", required_argument, NULL, 8}, + {"output", required_argument, NULL, 'o'}, + {"incl-flags", required_argument, NULL, 11}, + {"rf", required_argument, NULL, 11}, + {"excl-flags", required_argument, NULL, 12}, + {"ff", required_argument, NULL, 12}, + {"min-MQ", required_argument, NULL, 13}, + {"P-het", required_argument, NULL, 15}, + {"mode", required_argument, NULL, 'm'}, + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:", + lopts, NULL)) >= 0) { + switch (c) { + case 'a': opts.all_bases++; break; + case 'q': opts.use_qual=1; break; + case 'q'+1000: opts.use_qual=0; break; + case 'm'+1000: opts.use_mqual=1; break; + case 'm'+1001: opts.use_mqual=0; break; + case 14: opts.scale_mqual = atof(optarg); break; + case 9: opts.low_mqual = atoi(optarg); break; + case 10: opts.high_mqual = atoi(optarg); break; + case 'd': opts.min_depth = atoi(optarg); break; + case 'c': opts.call_fract = atof(optarg); break; + case 'H': opts.het_fract = atof(optarg); break; + case 'r': opts.reg = optarg; break; + case 'C': opts.cons_cutoff = atoi(optarg); break; + case 'A': opts.ambig = 1; break; + case 1: opts.default_qual = atoi(optarg); break; + case 6: opts.het_only = 1; break; + case 7: opts.show_del = (*optarg == 'y' || *optarg == 'Y'); break; + case 8: opts.show_ins = (*optarg == 'y' || *optarg == 'Y'); break; + case 13: opts.min_mqual = atoi(optarg); break; + case 15: opts.P_het = atof(optarg); break; + case 'q'+100: opts.adj_qual = 1; break; + case 'q'+101: opts.adj_qual = 0; break; + case 'm'+100: opts.nm_adjust = 1; break; + case 'm'+101: opts.nm_adjust = 0; break; + case 'h'+100: opts.nm_halo = atoi(optarg); break; + case 'h'+101: opts.sc_cost = atoi(optarg); break; + + 
case 'm': // mode + if (strcasecmp(optarg, "simple") == 0) { + opts.gap5 = 0; + } else if (strcasecmp(optarg, "bayesian") == 0) { + opts.gap5 = 1; + } else { + fprintf(stderr, "Unknown mode %s\n", optarg); + return 1; + } + break; + + case 'l': + if ((opts.line_len = atoi(optarg)) <= 0) + opts.line_len = INT_MAX; + break; + + case 'f': + if (strcasecmp(optarg, "fasta") == 0) { + opts.fmt = FASTA; + } else if (strcasecmp(optarg, "fastq") == 0) { + opts.fmt = FASTQ; + } else if (strcasecmp(optarg, "pileup") == 0) { + opts.fmt = PILEUP; + } else { + fprintf(stderr, "Unknown format %s\n", optarg); + return 1; + } + break; + + case 'o': + if (!(opts.fp_out = fopen(optarg, "w"))) { + perror(optarg); + return 1; + } + break; + + case 11: + if ((opts.incl_flags = bam_str2flag(optarg)) < 0) { + print_error("consensus", "could not parse --rf %s", optarg); + return 1; + } + break; + case 12: + if ((opts.excl_flags = bam_str2flag(optarg)) < 0) { + print_error("consensus", "could not parse --ff %s", optarg); + return 1; + } + break; + + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + usage_exit(stderr, EXIT_FAILURE); + } + } + + if (argc != optind+1) { + if (argc == optind) usage_exit(stdout, EXIT_SUCCESS); + else usage_exit(stderr, EXIT_FAILURE); + } + opts.fp = sam_open_format(argv[optind], "r", &ga.in); + if (opts.fp == NULL) { + print_error_errno("consensus", "Cannot open input file \"%s\"", + argv[optind]); + goto err; + } + if (ga.nthreads > 0) + hts_set_threads(opts.fp, ga.nthreads); + + if (hts_set_opt(opts.fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + goto err; + } + + if (!(opts.h = sam_hdr_read(opts.fp))) { + fprintf(stderr, "Failed to read header for \"%s\"\n", argv[optind]); + goto err; + } + + if (opts.reg) { + opts.idx = sam_index_load(opts.fp, argv[optind]); + if (!opts.idx) { + print_error("consensus", "Cannot load index for input file \"%s\"", + 
argv[optind]); + goto err; + } + opts.iter = sam_itr_querys(opts.idx, opts.h, opts.reg); + if (!opts.iter) { + print_error("consensus", "Failed to parse region \"%s\"", + opts.reg); + goto err; + } + } + + if (opts.fmt == PILEUP) { + if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, + basic_pileup, &opts) < 0) + goto err; + + if (opts.all_bases) { + int tid = opts.iter ? opts.iter->tid : opts.last_tid; + int len = sam_hdr_tid2len(opts.h, tid); + int pos = opts.last_pos; + if (opts.iter) { + len = MIN(opts.iter->end, len); + pos = MAX(opts.iter->beg, pos); + } + if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0) + goto err; + } + } else { + if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, + basic_fasta, + &opts) < 0) + goto err; + if (opts.all_bases) { + // fill out terminator + int tid = opts.iter ? opts.iter->tid : opts.last_tid; + int len = sam_hdr_tid2len(opts.h, tid); + int pos = opts.last_pos; + if (opts.iter) { + len = MIN(opts.iter->end, len); + pos = MAX(opts.iter->beg, pos); + opts.last_tid = opts.iter->tid; + } + if (pos < len) { + if (ks_expand(&opts.ks_ins_seq, len-pos+1) < 0) + goto err; + if (ks_expand(&opts.ks_ins_qual, len-pos+1) < 0) + goto err; + while (pos++ < len) { + opts.ks_ins_seq.s [opts.ks_ins_seq.l++] = 'N'; + opts.ks_ins_qual.s[opts.ks_ins_qual.l++] = '!'; + } + opts.ks_ins_seq.s [opts.ks_ins_seq.l] = 0; + opts.ks_ins_qual.s[opts.ks_ins_qual.l] = 0; + } + } + if (opts.last_tid >= 0) + dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid), + opts.ks_ins_seq.s, opts.ks_ins_seq.l, + opts.ks_ins_qual.s, opts.ks_ins_qual.l); +// if (consensus_loop(&opts) < 0) { +// print_error_errno("consensus", "Failed"); +// goto err; +// } + } + + ret = 0; + + err: + if (opts.iter) + hts_itr_destroy(opts.iter); + if (opts.idx) + hts_idx_destroy(opts.idx); + + if (opts.fp && sam_close(opts.fp) < 0) { + print_error_errno("consensus", "Closing input file \"%s\"", + argv[optind]); + ret = 1; + } + + if (opts.h) + 
sam_hdr_destroy(opts.h); + sam_global_args_free(&ga); + + if (opts.fp_out && opts.fp_out != stdout) + ret |= fclose(opts.fp_out) != 0; + else + ret |= fflush(stdout) != 0; + + ks_free(&opts.ks_line); + ks_free(&opts.ks_ins_seq); + ks_free(&opts.ks_ins_qual); + + if (ret) + print_error("consensus", "failed"); + + return ret; +} diff --git a/samtools/bam_consensus.c.pysam.c b/samtools/bam_consensus.c.pysam.c new file mode 100644 index 0000000..08536c6 --- /dev/null +++ b/samtools/bam_consensus.c.pysam.c @@ -0,0 +1,1714 @@ +#include "samtools.pysam.h" + +/* bam_consensus.c -- consensus subcommand. + + Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source) + Copyright (C) 2003-2005,2007-2022 Genome Research Ltd. + + Author: James Bonfield + +The primary work here is GRL since 2021, under an MIT license. +Sections derived from Gap5, which include calculate_consensus_gap5() +associated functions, are mostly copyright Genome Research Limited from +2003 onwards. These were originally under a BSD license, but as GRL is +copyright holder these portions can be considered to also be under the +same MIT license below: + + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + * The Gap5 consensus algorithm was in turn derived from the earlier Gap4 + * tool, developed by the Medical Research Council as part of the + * Staden Package. It is unsure how much of this source code is still + * extant, without deep review, but the license used was a compatible + * modified BSD license, included below. + */ + +/* +Modified BSD license for any legacy components from the Staden Package: + +Copyright (c) 2003 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + . Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + . Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + . Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +// FIXME: also use strand to spot possible basecalling errors. +// Specifically het calls where mods are predominantly on one +// strand. So maybe require + and - calls and check concordance +// before calling a het as confident. (Still call, but low qual?) + +// TODO: call by kmers rather than individual bases? Or use kmers to skew +// quality at least. It can identify variants that are low quality due to +// neighbouring edits that aren't consistently correlated. + +// TODO: pileup callback ought to know when it's the last in the region / +// chromosome. This means the caller code doesn't have to handle the +// termination phase and deduplicates the code. (Changing from +// one chr to the next is the same as ending the last.) +// +// TODO: track which reads contribute to multiple confirmed (HQ) differences +// vs which contribute to only one (LQ) difference. Correlated changes +// are more likely to be real. Ie consensus more of a path than solely +// isolated columns. +// +// Either that or a dummy "end of data" call is made to signify end to +// permit tidying up. Maybe add a "start of data" call too? + +// Eg 50T 20A seems T/A het, +// but 30T+ 20T- 18A+ 2A- seems like a consistent A miscall on one strand +// only, while T is spread evenly across both strands. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "samtools.h" +#include "sam_opts.h" +#include "bam_plbuf.h" +#include "consensus_pileup.h" + +#ifdef __SSE__ +# include +#else +# define _mm_prefetch(a,b) +#endif + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +// Minimum cutoff for storing mod data; => at least 10% chance +#define MOD_CUTOFF 0.46 + +enum format { + FASTQ, + FASTA, + PILEUP +}; + +typedef unsigned char uc; + +typedef struct { + // User options + char *reg; + int use_qual; + int min_qual; + int adj_qual; + int use_mqual; + double scale_mqual; + int nm_adjust; + int nm_halo; + int sc_cost; + int low_mqual; + int high_mqual; + int min_depth; + double call_fract; + double het_fract; + int gap5; + enum format fmt; + int cons_cutoff; + int ambig; + int line_len; + int default_qual; + int het_only; + int all_bases; + int show_del; + int show_ins; + int excl_flags; + int incl_flags; + int min_mqual; + double P_het; + + // Internal state + samFile *fp; + FILE *fp_out; + sam_hdr_t *h; + hts_idx_t *idx; + hts_itr_t *iter; + kstring_t ks_line; + kstring_t ks_ins_seq; + kstring_t ks_ins_qual; + int last_tid; + hts_pos_t last_pos; +} consensus_opts; + +/* -------------------------------------------------------------------------- + * A bayesian consensus algorithm that analyses the data to work out + * which hypothesis of pure A/C/G/T/absent and all combinations of two + * such bases meets the observations. + * + * This has its origins in Gap4 (homozygous) -> Gap5 (heterozygous) + * -> Crumble (tidied up to use htslib's pileup) -> here. 
+ * + */ + +#define CONS_DISCREP 4 +#define CONS_ALL 15 + +#define CONS_MQUAL 16 + +typedef struct { + /* the most likely base call - we never call N here */ + /* A=0, C=1, G=2, T=3, *=4 */ + int call; + + /* The most likely heterozygous base call */ + /* Use "ACGT*"[het / 5] vs "ACGT*"[het % 5] for the combination */ + int het_call; + + /* Log-odds for het_call */ + int het_logodd; + + /* Single phred style call */ + int phred; + + /* Sequence depth */ + int depth; + + /* Discrepancy search score */ + float discrep; +} consensus_t; + +#define P_HET 1e-4 + +#define LOG10 2.30258509299404568401 +#define TENOVERLOG10 4.34294481903251827652 +#define TENLOG2OVERLOG10 3.0103 + +#ifdef __GNUC__ +#define ALIGNED(x) __attribute((aligned(x))) +#else +#define ALIGNED(x) +#endif + +static double prior[25] ALIGNED(16); /* Sum to 1.0 */ +static double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */ + +/* Precomputed matrices for the consensus algorithm */ +static double pMM[101] ALIGNED(16); +static double p__[101] ALIGNED(16); +static double p_M[101] ALIGNED(16); + +static double e_tab_a[1002] ALIGNED(16); +static double *e_tab = &e_tab_a[500]; +static double e_tab2_a[1002] ALIGNED(16); +static double *e_tab2 = &e_tab2_a[500]; +static double e_log[501] ALIGNED(16); + +/* + * Lots of confusing matrix terms here, so some definitions will help. + * + * M = match base + * m = match pad + * _ = mismatch + * o = overcall + * u = undercall + * + * We need to distinguish between homozygous columns and heterozygous columns, + * done using a flat prior. This is implemented by treating every observation + * as coming from one of two alleles, giving us a 2D matrix of possibilities + * (the hypotheses) for each and every call (the observation). + * + * So pMM[] is the chance that given a call 'x' that it came from the + * x/x allele combination. Similarly p_o[] is the chance that call + * 'x' came from a mismatch (non-x) / overcall (consensus=*) combination. 
+ * + * Examples with observation (call) C and * follows + * + * C | A C G T * * | A C G T * + * ----------------- ----------------- + * A | __ _M __ __ o_ A | uu uu uu uu um + * C | _M MM _M _M oM C | uu uu uu uu um + * G | __ _M __ __ o_ G | uu uu uu uu um + * T | __ _M __ __ o_ T | uu uu uu uu um + * * | o_ oM o_ o_ oo * | um um um um mm + * + * In calculation terms, the _M is half __ and half MM, similarly o_ and um. + * + * Relative weights of substitution vs overcall vs undercall are governed on a + * per base basis using the P_OVER and P_UNDER scores (subst is + * 1-P_OVER-P_UNDER). + * + * The heterozygosity weight though is a per column calculation as we're + * trying to model whether the column is pure or mixed. Hence this is done + * once via a prior and has no affect on the individual matrix cells. + */ + +static void consensus_init(double p_het) { + int i; + + for (i = -500; i <= 500; i++) + e_tab[i] = exp(i); + for (i = -500; i <= 500; i++) + e_tab2[i] = exp(i/10.); + for (i = 0; i <= 500; i++) + e_log[i] = log(i); + + // Heterozygous locations + for (i = 0; i < 25; i++) + prior[i] = p_het / 20; + prior[0] = prior[6] = prior[12] = prior[18] = prior[24] = (1-p_het)/5; + + lprior15[0] = log(prior[0]); + lprior15[1] = log(prior[1]*2); + lprior15[2] = log(prior[2]*2); + lprior15[3] = log(prior[3]*2); + lprior15[4] = log(prior[4]*2); + lprior15[5] = log(prior[6]); + lprior15[6] = log(prior[7]*2); + lprior15[7] = log(prior[8]*2); + lprior15[8] = log(prior[9]*2); + lprior15[9] = log(prior[12]); + lprior15[10] = log(prior[13]*2); + lprior15[11] = log(prior[14]*2); + lprior15[12] = log(prior[18]); + lprior15[13] = log(prior[19]*2); + lprior15[14] = log(prior[24]); + + + // Rewrite as new form + for (i = 1; i < 101; i++) { + double prob = 1 - pow(10, -i / 10.0); + + // May want to multiply all these by 5 so pMM[i] becomes close + // to -0 for most data. This makes the sums increment very slowly, + // keeping bit precision in the accumulator. 
+ pMM[i] = log(prob/5); + p__[i] = log((1-prob)/20); + p_M[i] = log((exp(pMM[i]) + exp(p__[i]))/2); + } + + pMM[0] = pMM[1]; + p__[0] = p__[1]; + p_M[0] = p_M[1]; +} + +static inline double fast_exp(double y) { + if (y >= -50 && y <= 50) + return e_tab2[(int)(y*10)]; + + if (y < -500) + y = -500; + if (y > 500) + y = 500; + + return e_tab[(int)y]; +} + +/* Taylor (deg 3) implementation of the log */ +static inline double fast_log2(double val) +{ + // FP representation is exponent & mantissa, where + // value = 2^E * M. + // Hence log2(value) = log2(2^E * M) + // = log2(2^E)+ log2(M) + // = E + log2(M) + union { double d; uint64_t x; } u = {val}; + const int E = ((u.x >> 52) & 2047) - 1024; // exponent E + // Initial log2(M) based on mantissa + u.x &= ~(2047LL << 52); + u.x += 1023LL << 52; + + val = ((-1/3.) * u.d + 2) * u.d - 2/3.; + + return E + val; +} + +#define ph_log(x) (-TENLOG2OVERLOG10*fast_log2((x))) + + +int nins(const bam1_t *b){ + int i, indel = 0; + uint32_t *cig = bam_get_cigar(b); + for (i = 0; i < b->core.n_cigar; i++) { + int op = bam_cigar_op(cig[i]); + if (op == BAM_CINS || op == BAM_CDEL) + indel += bam_cigar_oplen(cig[i]); + } + return indel; +} + +// Return the local NM figure within halo (+/- HALO) of pos. +// This local NM is used as a way to modify MAPQ to get a localised MAPQ +// score via an adhoc fashion. +double nm_local(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { + int *nm = (int *)p->cd; + if (!nm) + return 0; + pos -= b->core.pos; + if (pos < 0) + return nm[0]; + if (pos >= b->core.l_qseq) + return nm[b->core.l_qseq-1]; + + return nm[pos] / 10.0; +} + +/* + * Initialise a new sequence appearing in the pileup. We use this to + * precompute some metrics that we'll repeatedly use in the consensus + * caller; the localised NM score. + * + * We also directly amend the BAM record (which will be discarded later + * anyway) to modify qualities to account for local quality minima. 
+ * + * Returns 0 (discard) or 1 (keep) on success, -1 on failure. + */ +int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { + consensus_opts *opts = (consensus_opts *)client_data; + if (!opts->use_mqual) + return 1; + + const bam1_t *b = &p->b; + int qlen = b->core.l_qseq, i; + int *local_nm = calloc(qlen, sizeof(*local_nm)); + if (!local_nm) + return -1; + p->cd = local_nm; + + if (opts->adj_qual) { +#if 0 + // Tweak by localised quality. + // Quality is reduced by a significant portion of the minimum quality + // in neighbouring bases, on the pretext that if the region is bad, then + // this base is bad even if it claims otherwise. + uint8_t *qual = bam_get_qual(b); + const int qhalo = 8; // 2? + int qmin = 50; // effectively caps PacBio qual too + for (i = 0; i < qlen && i < qhalo; i++) { + local_nm[i] = qual[i]; + if (qmin > qual[i]) + qmin = qual[i]; + } + for (;i < qlen-qhalo; i++) { + //int t = (qual[i]*1 + 3*qmin)/4; // good on 60x + int t = (qual[i] + 5*qmin)/4; // good on 15x + local_nm[i] = t < qual[i] ? t : qual[i]; + if (qmin > qual[i+qhalo]) + qmin = qual[i+qhalo]; + else if (qmin <= qual[i-qhalo]) { + int j; + qmin = 50; + for (j = i-qhalo+1; j <= i+qhalo; j++) + if (qmin > qual[j]) + qmin = qual[j]; + } + } + for (; i < qlen; i++) { + local_nm[i] = qual[i]; + local_nm[i] = (local_nm[i] + 6*qmin)/4; + } + + for (i = 0; i < qlen; i++) { + qual[i] = local_nm[i]; + + // Plus overall rescale. + // Lower becomes lower, very high becomes a little higher. + // Helps deep GIAB, but detrimental elsewhere. (What this really + // indicates is quality calibration differs per data set.) + // It's probably something best accounted for somewhere else. 
+ + //qual[i] = qual[i]*qual[i]/40+1; + } + memset(local_nm, 0, qlen * sizeof(*local_nm)); +#else + // Skew local NM by qual vs min-qual delta + uint8_t *qual = bam_get_qual(b); + const int qhalo = 8; // 4 + int qmin = 99; + for (i = 0; i < qlen && i < qhalo; i++) { + if (qmin > qual[i]) + qmin = qual[i]; + } + for (;i < qlen-qhalo; i++) { + int t = (qual[i] + 5*qmin)/4; // good on 15x + local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + if (qmin > qual[i+qhalo]) + qmin = qual[i+qhalo]; + else if (qmin <= qual[i-qhalo]) { + int j; + qmin = 99; + for (j = i-qhalo+1; j <= i+qhalo; j++) + if (qmin > qual[j]) + qmin = qual[j]; + } + } + for (; i < qlen; i++) { + int t = (qual[i] + 5*qmin)/4; // good on 15x + local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + } +#endif + } + + // Adjust local_nm array by the number of edits within + // a defined region (pos +/- halo). + const int halo = opts->nm_halo; + const uint8_t *md = bam_aux_get(b, "MD"); + if (!md) + return 1; + md = (const uint8_t *)bam_aux2Z(md); + + // Handle cost of being near a soft-clip + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar; + + if ( (cig[0] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || + ((cig[0] & BAM_CIGAR_MASK) == BAM_CHARD_CLIP && ncig > 1 && + (cig[1] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP)) { + for (i = 0; i < halo && i < qlen; i++) + local_nm[i]+=opts->sc_cost; + for (; i < halo*2 && i < qlen; i++) + local_nm[i]+=opts->sc_cost>>1; + } + if ( (cig[ncig-1] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || + ((cig[ncig-1] & BAM_CIGAR_MASK) == BAM_CHARD_CLIP && ncig > 1 && + (cig[ncig-2] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP)) { + for (i = qlen-1; i >= qlen-halo && i >= 0; i--) + local_nm[i]+=opts->sc_cost; + for (; i >= qlen-halo*2 && i >= 0; i--) + local_nm[i]+=opts->sc_cost>>1; + } + + // Now iterate over MD tag + int pos = 0; + while (*md) { + if (isdigit(*md)) { + uint8_t *endptr; + long i = strtol((char *)md, (char **)&endptr, 10); + md = endptr; + pos += i; + continue; + } + + // deletion. 
+ // Should we bump local_nm here too? Maybe + if (*md == '^') { + while (*++md && !isdigit(*md)) + continue; + continue; + } + + // substitution + for (i = pos-halo*2 >= 0 ? pos-halo*2 : 0; i < pos-halo; i++) + local_nm[i]+=5; + for (; i < pos+halo && i < qlen; i++) + local_nm[i]+=10; + for (; i < pos+halo*2 && i < qlen; i++) + local_nm[i]+=5; + md++; + } + + return 1; +} + + +static +int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, + pileup_t *plp, consensus_opts *opts, + consensus_t *cons, int default_qual) { + int i, j; + static int init_done =0; + static double q2p[101], mqual_pow[256]; + double min_e_exp = DBL_MIN_EXP * log(2) + 1; + + double S[15] ALIGNED(16) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + double sumsC[6] = {0,0,0,0,0,0}; // A C G T * N + + // Small hash on seq to check for uniqueness of surrounding bases. + // If it's frequent, then it's more likely to be correctly called than + // if it's rare. + // Helps a bit on deep data, especially with K2=3, but detrimental on + // shallow and (currently) quite a slow down. 
+ +//#define K2 2 +#ifdef K2 + int hashN[1<<(K2*4+2)] = {0}; + int hash1[1<<2] = {0}; +#endif + + /* Map the 15 possible combinations to 1-base or 2-base encodings */ + static int map_sing[15] ALIGNED(16) = + {0, 5, 5, 5, 5, + 1, 5, 5, 5, + 2, 5, 5, + 3, 5, + 4}; + static int map_het[15] ALIGNED(16) = + {0, 1, 2, 3, 4, + 6, 7, 8, 9, + 12, 13, 14, + 18, 19, + 24}; + + if (!init_done) { + init_done = 1; + consensus_init(opts->P_het); + + for (i = 0; i <= 100; i++) { + q2p[i] = pow(10, -i/10.0); + } + + for (i = 0; i < 255; i++) { + //mqual_pow[i] = 1-pow(10, -(i+.01)/10.0); + mqual_pow[i] = 1-pow(10, -(i*.9)/10.0); + //mqual_pow[i] = 1-pow(10, -(i/3+.1)/10.0); + //mqual_pow[i] = 1-pow(10, -(i/2+.05)/10.0); + } + // unknown mqual + mqual_pow[255] = mqual_pow[10]; + } + + /* Initialise */ + int counts[6] = {0}; + + /* Accumulate */ + +#ifdef K2 + const pileup_t *ptmp = plp; + for (; ptmp; ptmp = ptmp->next) { + const pileup_t *p = ptmp; + if (p->qual < opts->min_qual) + continue; + + int hb = 0; +#define _ 0 + static int X[16] = {_,0,1,_,2,_,_,_,3,_,_,_,_,_,_,_}; +#undef _ + uint8_t *seq = bam_get_seq(&p->b); + int i, base1 = X[p->base4]; + hash1[base1]++; + for (i = p->seq_offset-K2; i <= p->seq_offset+K2; i++) { + int base = i >= 0 && i < p->b.core.l_qseq ? X[bam_seqi(seq,i)] : _; + hb = (hb<<2)|base; + } + hashN[hb]++; + } +#endif + + int td = depth; // original depth + depth = 0; + for (; plp; plp = plp->next) { + pileup_t *p = plp; + + if (p->next) + _mm_prefetch(p->next, _MM_HINT_T0); + + if (p->qual < opts->min_qual) + continue; + + if (p->ref_skip) + continue; + +#ifdef K2 + int hb = 0; +#define _ 0 + static int X[16] = {_,0,1,_,2,_,_,_,3,_,_,_,_,_,_,_}; + int i, base1 = X[p->base4]; + for (i = p->seq_offset-K2; i <= p->seq_offset+K2; i++) { + int base = i >= 0 && i < p->b.core.l_qseq ? 
X[bam_seqi(seq,i)] : _; + hb = (hb<<2)|base; + } + // fprintf(samtools_stderr, "%c: %d %d of %d\t%d %d\n", p->base, hashN[hb], hash1[base1], td, p->qual, p->qual * hashN[hb] / hash1[base1]); +#undef _ +#endif + + const bam1_t *b = &p->b; + uint8_t base = p->base4; + uint8_t *qual_arr = bam_get_qual(b); + uint8_t qual = p->qual; + //qual = qual*qual/40+1; + if (qual == 255 || (qual == 0 && *qual_arr == 255)) + qual = default_qual; + +#ifdef K2 + //qual = qual * hashN[hb] / hash1[base1]; + qual -= -TENOVERLOG10*log(hashN[hb] / (hash1[base1]+.1)); + if (qual < 1) + qual = 1; +#endif + + // =ACM GRSV TWYH KDBN * + static int L[32] = { + 5,0,1,5, 2,5,5,5, 3,5,5,5, 5,5,5,5, + 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4, + }; + + // convert from sam base to acgt*n order. + base = L[base]; + + double MM, __, _M, qe; + + // Correction for mapping quality. Maybe speed up via lookups? + // Cannot nullify mapping quality completely. Lots of (true) + // SNPs means low mapping quality. (Ideally need to know + // hamming distance to next best location.) + + if (flags & CONS_MQUAL) { + int mqual = b->core.qual; + if (opts->nm_adjust) { + mqual /= (nm_local(p, b, pos)+1); + mqual *= 1 + 2*(0.5-(td>30?30:td)/60.0); // depth fudge + } + + // higher => call more; +FP, -FN + // lower => call less; -FP, +FN + mqual *= opts->scale_mqual; + + // Drop these? They don't seem to ever help. 
+ if (mqual < opts->low_mqual) + mqual = opts->low_mqual; + if (mqual > opts->high_mqual) + mqual = opts->high_mqual; + + double _p = 1-q2p[qual]; + double _m = mqual_pow[mqual]; + qual = ph_log(1-(_m * _p + (1 - _m)/4)); // CURRENT + //qual = ph_log(1-_p*_m); // testing + //qual *= 6/sqrt(td); + } + + /* Quality 0 should never be permitted as it breaks the maths */ + if (qual < 1) + qual = 1; + + __ = p__[qual]; // neither match + MM = pMM[qual] - __; // both match + _M = p_M[qual] - __; // one allele only (half match) + + if (flags & CONS_DISCREP) { + qe = q2p[qual]; + sumsC[base] += 1 - qe; + } + + counts[base]++; + + switch (base) { + case 0: // A + S[0] += MM; + S[1] += _M; + S[2] += _M; + S[3] += _M; + S[4] += _M; + break; + + case 1: // C + S[1] += _M; + S[5] += MM; + S[6] += _M; + S[7] += _M; + S[8] += _M; + break; + + case 2: // G + S[ 2] += _M; + S[ 6] += _M; + S[ 9] += MM; + S[10] += _M; + S[11] += _M; + break; + + case 3: // T + S[ 3] += _M; + S[ 7] += _M; + S[10] += _M; + S[12] += MM; + S[13] += _M; + + break; + + case 4: // * + S[ 4] += _M; + S[ 8] += _M; + S[11] += _M; + S[13] += _M; + S[14] += MM; + break; + + case 5: /* N => equal weight to all A,C,G,T but not a pad */ + S[ 0] += MM; + S[ 1] += MM; + S[ 2] += MM; + S[ 3] += MM; + S[ 4] += _M; + + S[ 5] += MM; + S[ 6] += MM; + S[ 7] += MM; + S[ 8] += _M; + + S[ 9] += MM; + S[10] += MM; + S[11] += _M; + + S[12] += MM; + S[13] += _M; + break; + } + + depth++; + + if (p->eof && p->cd) { + free(p->cd); + p->cd = NULL; + } + } + + /* We've accumulated stats, so now we speculate on the consensus call */ + double shift, max, max_het, norm[15]; + int call = 0, het_call = 0; + double tot1 = 0, tot2 = 0; + + /* + * Scale numbers so the maximum score is 0. This shift is essentially + * a multiplication in non-log scale to both numerator and denominator, + * so it cancels out. We do this to avoid calling exp(-large_num) and + * ending up with norm == 0 and hence a 0/0 error. 
+ * + * Can also generate the base-call here too. + */ + shift = -DBL_MAX; + max = -DBL_MAX; + max_het = -DBL_MAX; + + for (j = 0; j < 15; j++) { + S[j] += lprior15[j]; + if (shift < S[j]) + shift = S[j]; + + /* Only call pure AA, CC, GG, TT, ** for now */ + if (j != 0 && j != 5 && j != 9 && j != 12 && j != 14) { + if (max_het < S[j]) { + max_het = S[j]; + het_call = j; + } + continue; + } + + if (max < S[j]) { + max = S[j]; + call = j; + } + } + + /* + * Shift and normalise. + * If call is, say, b we want p = b/(a+b+c+...+n), but then we do + * p/(1-p) later on and this has exceptions when p is very close + * to 1. + * + * Hence we compute b/(a+b+c+...+n - b) and + * rearrange (p/norm) / (1 - (p/norm)) to be p/norm2. + */ + for (j = 0; j < 15; j++) { + S[j] -= shift; + double e = fast_exp(S[j]); + S[j] = (S[j] > min_e_exp) ? e : DBL_MIN; + norm[j] = 0; + } + + for (j = 0; j < 15; j++) { + norm[j] += tot1; + norm[14-j] += tot2; + tot1 += S[j]; + tot2 += S[14-j]; + } + + /* And store result */ + if (!depth || depth == counts[5] /* all N */) { + cons->call = 4; /* N */ + cons->het_call = 0; + cons->het_logodd = 0; + cons->phred = 0; + cons->depth = 0; + cons->discrep = 0; + return 0; + } + + cons->depth = depth; + + /* Call */ + if (norm[call] == 0) norm[call] = DBL_MIN; + // Approximation of phred for when S[call] ~= 1 and norm[call] + // is small. Otherwise we need the full calculation. + int ph; + if (S[call] == 1 && norm[call] < .01) + ph = ph_log(norm[call]) + .5; + else + ph = ph_log(1-S[call]/(norm[call]+S[call])) + .5; + + cons->call = map_sing[call]; + cons->phred = ph < 0 ? 
0 : ph; + + if (norm[het_call] == 0) norm[het_call] = DBL_MIN; + ph = TENLOG2OVERLOG10 * (fast_log2(S[het_call]) + - fast_log2(norm[het_call])) + .5; + + cons->het_call = map_het[het_call]; + cons->het_logodd = ph; + + /* Compute discrepancy score */ + if (flags & CONS_DISCREP) { + double m = sumsC[0]+sumsC[1]+sumsC[2]+sumsC[3]+sumsC[4]; + double c; + if (cons->het_logodd > 0) + c = sumsC[cons->het_call%5] + sumsC[cons->het_call/5]; + else + c = sumsC[cons->call]; + cons->discrep = (m-c)/sqrt(m); + } + + return 0; +} + + +/* -------------------------------------------------------------------------- + * Main processing logic + */ + +static void dump_fastq(consensus_opts *opts, + const char *name, + const char *seq, size_t seq_l, + const char *qual, size_t qual_l) { + enum format fmt = opts->fmt; + int line_len = opts->line_len; + FILE *fp = opts->fp_out; + + fprintf(fp, "%c%s\n", ">@"[fmt==FASTQ], name); + size_t i; + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), seq+i); + + if (fmt == FASTQ) { + fprintf(fp, "+\n"); + for (i = 0; i < seq_l; i += line_len) + fprintf(fp, "%.*s\n", (int)MIN(line_len, seq_l - i), qual+i); + } +} + +//--------------------------------------------------------------------------- + +/* + * Reads a single alignment record, using either the iterator + * or a direct sam_read1 call. + */ +static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { + consensus_opts *opts = (consensus_opts *)dat; + + for (;;) { + int ret = opts->iter + ? 
sam_itr_next(fp, opts->iter, b) + : sam_read1(fp, h, b); + if (ret < 0) + return ret; + + // Apply hard filters + if (opts->incl_flags && !(b->core.flag & opts->incl_flags)) + continue; + if (opts->excl_flags && (b->core.flag & opts->excl_flags)) + continue; + if (b->core.qual < opts->min_mqual) + continue; + + return ret; + } +} + +/* -------------------------------------------------------------------------- + * A simple summing algorithm, either pure base frequency, or by + * weighting them according to their quality values. + * + * This is crude, but easy to understand and fits with several + * standard pileup criteria (eg COG-UK / CLIMB Covid-19 seq project). + * + * + * call1 / score1 / depth1 is the highest scoring allele. + * call2 / score2 / depth2 is the second highest scoring allele. + * + * Het_fract: score2/score1 + * Call_fract: score1 or score1+score2 over total score + * Min_depth: minimum total depth of utilised bases (depth1+depth2) + * Min_score: minimum total score of utilised bases (score1+score2) + * + * Eg het_fract 0.66, call_fract 0.75 and min_depth 10. + * 11A, 2C, 2G (14 total depth) is A. + * 9A, 2C, 2G (12 total depth) is N as depth(A) < 10. + * 11A, 5C, 5G (21 total depth) is N as 11/21 < 0.75 (call_fract) + * + * + * 6A, 5G, 1C (12 total depth) is AG het as depth(A)+depth(G) >= 10 + * and 5/6 >= 0.66 and 11/12 >= 0.75. + * + * 6A, 5G, 4C (15 total depth) is N as (6+5)/15 < 0.75 (call_fract). + * + * + * Note for the purpose of deletions, a base/del has an ambiguity + * code of lower-case base (otherwise it is uppercase). + */ +static int calculate_consensus_simple(const pileup_t *plp, + consensus_opts *opts, int *qual) { + int i, min_qual = opts->min_qual; + + // Map "seqi" nt16 to A,C,G,T compatibility with weights on pure bases. 
+ // where seqi is A | (C<<1) | (G<<2) | (T<<3) + // * A C M G R S V T W Y H K D B N + static int seqi2A[16] = { 0,8,0,4, 0,4,0,2, 0,4,0,2, 0,2,0,1 }; + static int seqi2C[16] = { 0,0,8,4, 0,0,4,2, 0,0,4,2, 0,0,2,1 }; + static int seqi2G[16] = { 0,0,0,0, 8,4,4,1, 0,0,0,0, 4,2,2,1 }; + static int seqi2T[16] = { 0,0,0,0, 0,0,0,0, 8,4,4,2, 8,2,2,1 }; + + // Ignore ambiguous bases in seq for now, so we don't treat R, Y, etc + // as part of one base and part another. Based on BAM seqi values. + // We also use freq[16] as "*" for gap. + int freq[17] = {0}; // base frequency, aka depth + int score[17] = {0}; // summation of base qualities + + // Accumulate + for (; plp; plp = plp->next) { + const pileup_t *p = plp; + if (p->next) + _mm_prefetch(p->next, _MM_HINT_T0); + + int q = p->qual; + if (q < min_qual) + // Should we still record these in freq[] somewhere so + // we can use them in the fracts? + // Difference between >= X% of high-qual bases calling Y + // and >= X% of all bases are high-quality Y calls. + continue; + + //int b = p->is_del ? 16 : bam_seqi(bam_get_seq(&p->b), p->seq_offset); + int b = p->base4; + + // Map ambiguity codes to one or more component bases. + if (b < 16) { + int Q = seqi2A[b] * (opts->use_qual ? q : 1); + freq[1] += Q?1:0; + score[1] += Q?Q:0; + Q = seqi2C[b] * (opts->use_qual ? q : 1); + freq[2] += Q?1:0; + score[2] += Q?Q:0; + Q = seqi2G[b] * (opts->use_qual ? q : 1); + freq[4] += Q?1:0; + score[4] += Q?Q:0; + Q = seqi2T[b] * (opts->use_qual ? q : 1); + freq[8] += Q?1:0; + score[8] += Q?Q:0; + } else { /* * */ + freq[16] ++; + score[16]+=8 * (opts->use_qual ? 
q : 1); + } + } + + // Total usable depth + int tscore = 0; + for (i = 0; i < 5; i++) + tscore += score[1<= opts->het_fract * score1 && opts->ambig) { + used_base |= call2; + used_score += score2; + used_depth += depth2; + } + + // N is too shallow, or insufficient proportion of total + if (used_depth < opts->min_depth || + used_score < opts->call_fract * tscore) { + used_depth = 0; + // But note shallow gaps are still called gaps, not N, as + // we're still more confident there is no base than it is + // A, C, G or T. + used_base = call1 == 16 /*&& depth1 >= call_fract * depth*/ + ? 16 : 0; // * or N + } + + // Our final call. "?" shouldn't be possible to generate + const char *het = + "NACMGRSVTWYHKDBN" + "*ac?g???t???????"; + + //printf("%c %d\n", het[used_base], used_depth); + if (qual) + *qual = used_base ? 100.0 * used_score / tscore : 0; + + return het[used_base]; +} + +static int empty_pileup2(consensus_opts *opts, sam_hdr_t *h, int tid, + hts_pos_t start, hts_pos_t end) { + const char *name = sam_hdr_tid2name(h, tid); + hts_pos_t i; + + int err = 0; + for (i = start; i < end; i++) + err |= fprintf(opts->fp_out, "%s\t%"PRIhts_pos"\t0\t0\tN\t0\t*\t*\n", name, i+1) < 0; + + return err ? 
-1 : 0; +} + +/* + * Returns 0 on success + * -1 on failure + */ +static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, + int depth, hts_pos_t pos, int nth, int is_insert) { + unsigned char *qp, *cp; + char *rp; + int ref, cb, cq; + consensus_opts *opts = (consensus_opts *)cd; + int tid = p->b.core.tid; + +// opts->show_ins=0; +// opts->show_del=1; + if (!opts->show_ins && nth) + return 0; + + if (opts->iter) { + if (opts->iter->beg >= pos || opts->iter->end < pos) + return 0; + } + + if (opts->all_bases) { + if (tid != opts->last_tid && opts->last_tid >= 0) { + hts_pos_t len = sam_hdr_tid2len(opts->h, opts->last_tid); + if (opts->iter) + len = MIN(opts->iter->end, len); + if (empty_pileup2(opts, opts->h, opts->last_tid, opts->last_pos, + len) < 0) + return -1; + if (tid >= 0) { + if (empty_pileup2(opts, opts->h, tid, + opts->iter ? opts->iter->beg : 0, + pos-1) < 0) + return -1; + } + } + if (opts->last_pos >= 0 && pos > opts->last_pos+1) { + if (empty_pileup2(opts, opts->h, p->b.core.tid, opts->last_pos, + pos-1) < 0) + return -1; + } else if (opts->last_pos < 0) { + if (empty_pileup2(opts, opts->h, p->b.core.tid, + opts->iter ? opts->iter->beg : 0, pos-1) < 0) + return -1; + } + } + + if (opts->gap5) { + consensus_t cons; + calculate_consensus_gap5(pos, opts->use_mqual ? 
CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual); + if (cons.het_logodd > 0 && opts->ambig) { + cb = "AMRWa" // 5x5 matrix with ACGT* per row / col + "MCSYc" + "RSGKg" + "WYKTt" + "acgt*"[cons.het_call]; + cq = cons.het_logodd; + } else{ + cb = "ACGT*"[cons.call]; + cq = cons.phred; + } + if (cq < opts->cons_cutoff && cb != '*') { + cb = 'N'; + cq = 0; + } + } else { + cb = calculate_consensus_simple(p, opts, &cq); + } + if (cb < 0) + return -1; + + if (!p) + return 0; + + if (!opts->show_del && cb == '*') + return 0; + + /* Ref, pos, nth, score, seq, qual */ + kstring_t *ks = &opts->ks_line; + ks->l = 0; + ref = p->b.core.tid; + rp = (char *)sam_hdr_tid2name(h, ref); + + int err = 0; + err |= kputs(rp, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(pos, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(nth, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(depth, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputc_(cb, ks) < 0; + err |= kputc_('\t', ks) < 0; + err |= kputw(cq, ks) < 0; + err |= kputc_('\t', ks) < 0; + if (err) + return -1; + + /* Seq + qual at predetermined offsets */ + if (ks_resize(ks, ks->l + depth*2 + 2) < 0) + return -1; + + cp = (unsigned char *)ks->s + ks->l; + ks->l += depth*2 + 2; + qp = cp+depth+1; + for (; p; p = p->next) { + // Too tight a loop to help much, but some benefit still + if (p->next && p->next->next) + _mm_prefetch(p->next->next, _MM_HINT_T0); + if (p->b_is_rev) { + *cp++ = p->base == '*' ? 
'#' : tolower(p->base); + } else { + *cp++ = p->base; + } + *qp++ = MIN(p->qual,93) + '!'; + } + *cp++ = '\t'; + *qp++ = '\n'; + if (fwrite(ks->s, 1, ks->l, opts->fp_out) != ks->l) + return -1; + + opts->last_pos = pos; + opts->last_tid = tid; + + return 0; +} + +static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, + int depth, hts_pos_t pos, int nth, int is_insert) { + int cb, cq; + consensus_opts *opts = (consensus_opts *)cd; + int tid = p->b.core.tid; + kstring_t *seq = &opts->ks_ins_seq; + kstring_t *qual = &opts->ks_ins_qual; + + if (!opts->show_ins && nth) + return 0; + + if (opts->iter) { + if (opts->iter->beg >= pos || opts->iter->end < pos) + return 0; + } + + if (tid != opts->last_tid) { + if (opts->last_tid != -1) { + if (opts->all_bases) { + int i, N; + if (opts->iter) { + opts->last_pos = MAX(opts->last_pos, opts->iter->beg-1); + N = opts->iter->end; + } else { + N = INT_MAX; + } + N = MIN(N, sam_hdr_tid2len(opts->h, opts->last_tid)) + - opts->last_pos; + if (N > 0) { + if (ks_expand(seq, N+1) < 0) + return -1; + if (ks_expand(qual, N+1) < 0) + return -1; + for (i = 0; i < N; i++) { + seq->s[seq->l++] = 'N'; + qual->s[qual->l++] = '!'; + } + seq->s[seq->l] = 0; + qual->s[qual->l] = 0; + } + } + dump_fastq(opts, sam_hdr_tid2name(opts->h, opts->last_tid), + seq->s, seq->l, qual->s, qual->l); + } + + seq->l = 0; qual->l = 0; + opts->last_tid = tid; +// if (opts->all_bases) +// opts->last_pos = 0; + if (opts->iter) + opts->last_pos = opts->iter->beg; + else + opts->last_pos = opts->all_bases ? 0 : pos-1; + } + + // share this with basic_pileup + if (opts->gap5) { + consensus_t cons; + calculate_consensus_gap5(pos, opts->use_mqual ? 
CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual); + if (cons.het_logodd > 0 && opts->ambig) { + cb = "AMRWa" // 5x5 matrix with ACGT* per row / col + "MCSYc" + "RSGKg" + "WYKTt" + "acgt*"[cons.het_call]; + cq = cons.het_logodd; + } else{ + cb = "ACGT*"[cons.call]; + cq = cons.phred; + } + if (cq < opts->cons_cutoff && cb != '*' && + cons.het_call % 5 != 4 && cons.het_call / 5 != 4) { + // het base/* keeps base or * as most likely pure call, else N. + // This is because we don't have a traditional way of representing + // base or not-base ambiguity. + cb = 'N'; + cq = 0; + } + } else { + cb = calculate_consensus_simple(p, opts, &cq); + } + if (cb < 0) + return -1; + + if (!p) + return 0; + + if (!opts->show_del && cb == '*') { + opts->last_pos = pos; + opts->last_tid = tid; + return 0; + } + // end of share + + // Append consensus base/qual to seqs + if (pos > opts->last_pos) { + if (opts->last_pos >= 0 || opts->all_bases) { + // FIXME: don't expand qual if fasta + if (ks_expand(seq, pos - opts->last_pos) < 0 || + ks_expand(qual, pos - opts->last_pos) < 0) + return -1; + memset(seq->s + seq->l, 'N', pos - (opts->last_pos+1)); + memset(qual->s + qual->l, '!', pos - (opts->last_pos+1)); + seq->l += pos - (opts->last_pos+1); + qual->l += pos - (opts->last_pos+1); + } + } + if ((nth && opts->show_ins && cb != '*') + || cb != '*' || (pos > opts->last_pos && opts->show_del)) { + int err = 0; + err |= kputc(cb, seq) < 0; + err |= kputc(MIN(cq, '~'-'!')+'!', qual) < 0; + if (err) + return -1; + } + + opts->last_pos = pos; + opts->last_tid = tid; + + return 0; +} +// END OF NEW PILEUP +//--------------------------------------------------------------------------- + +static void usage_exit(FILE *fp, int exit_status) { + fprintf(fp, "Usage: samtools consensus [options] \n"); + fprintf(fp, "\nOptions:\n"); + fprintf(fp, " -r, --region REG Limit query to REG. 
Requires an index\n"); + fprintf(fp, " -f, --format FMT Output in format FASTA, FASTQ or PILEUP [FASTA]\n"); + fprintf(fp, " -l, --line-len INT Wrap FASTA/Q at line length INT [70]\n"); + fprintf(fp, " -o, --output FILE Output consensus to FILE\n"); + fprintf(fp, " -m, --mode STR Switch consensus mode to \"simple\"/\"bayesian\" [bayesian]\n"); + fprintf(fp, " -a Output all bases (start/end of reference)\n"); + fprintf(fp, " --rf, --incl-flags STR|INT\n"); + fprintf(fp, " Only include reads with any flag bit set [0]\n"); + fprintf(fp, " --ff, --excl-flags STR|INT\n"); + fprintf(fp, " Exclude reads with any flag bit set\n"); + fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); + fprintf(fp, " --min-MQ INT Exclude reads with mapping quality below INT [0]\n"); + fprintf(fp, " --show-del yes/no Whether to show deletion as \"*\" [no]\n"); + fprintf(fp, " --show-ins yes/no Whether to show insertions [yes]\n"); + fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n"); + fprintf(fp, "\nFor simple consensus mode:\n"); + fprintf(fp, " -q, --(no-)use-qual Use quality values in calculation [off]\n"); + fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n"); + fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n"); + fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.5]\n"); + fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n"); + fprintf(fp, " -C, --cutoff C Consensus cutoff quality C [10]\n"); + fprintf(fp, " --(no-)adj-qual Modify quality with local minima [on]\n"); + fprintf(fp, " --(no-)use-MQ Use mapping quality in calculation [on]\n"); + fprintf(fp, " --(no-)adj-MQ Modify mapping quality by local NM [on]\n"); + fprintf(fp, " --NM-halo INT Size of window for NM count in --adj-MQ [50]\n"); + fprintf(fp, " --scale-MQ FLOAT Scale mapping quality by FLOAT [1.00]\n"); + fprintf(fp, " --low-MQ INT Cap minimum mapping quality [1]\n"); + fprintf(fp, " --high-MQ INT Cap maximum mapping quality 
[60]\n"); + fprintf(fp, " --P-het FLOAT Probability of heterozygous site[%.1e]\n", + P_HET); + + fprintf(fp, "\nGlobal options:\n"); + sam_global_opt_help(fp, "-.---@-."); + samtools_exit(exit_status); +} + +int main_consensus(int argc, char **argv) { + int c, ret = 1; + + consensus_opts opts = { + // User options + .gap5 = 1, + .use_qual = 0, + .min_qual = 0, + .adj_qual = 1, + .use_mqual = 1, + .scale_mqual = 1.00, + .nm_adjust = 1, + .nm_halo = 50, + .sc_cost = 60, + .low_mqual = 1, + .high_mqual = 60, + .min_depth = 1, + .call_fract = 0.75, + .het_fract = 0.5, + .het_only = 0, + .fmt = FASTA, + .cons_cutoff = 10, + .ambig = 0, + .line_len = 70, + .default_qual = 10, + .all_bases = 0, + .show_del = 0, + .show_ins = 1, + .incl_flags = 0, + .excl_flags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, + .min_mqual = 0, + .P_het = P_HET, + + // Internal state + .ks_line = {0,0}, + .ks_ins_seq = {0,0}, + .ks_ins_qual = {0,0}, + .fp = NULL, + .fp_out = samtools_stdout, + .iter = NULL, + .idx = NULL, + .last_tid = -1, + .last_pos = -1, + }; + + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), + {"use-qual", no_argument, NULL, 'q'}, + {"no-use-qual", no_argument, NULL, 'q'+1000}, + {"adj-qual", no_argument, NULL, 'q'+100}, + {"no-adj-qual", no_argument, NULL, 'q'+101}, + {"use-MQ", no_argument, NULL, 'm'+1000}, + {"no-use-MQ", no_argument, NULL, 'm'+1001}, + {"adj-MQ", no_argument, NULL, 'm'+100}, + {"no-adj-MQ", no_argument, NULL, 'm'+101}, + {"NM-halo", required_argument, NULL, 'h'+100}, + {"SC-cost", required_argument, NULL, 'h'+101}, + {"scale-MQ", required_argument, NULL, 14}, + {"low-MQ" , required_argument, NULL, 9}, + {"high-MQ", required_argument, NULL, 10}, + {"min-depth", required_argument, NULL, 'd'}, + {"call-fract", required_argument, NULL, 'c'}, + {"het-fract", required_argument, NULL, 'H'}, + {"region", required_argument, NULL, 'r'}, + {"format", 
required_argument, NULL, 'f'}, + {"cutoff", required_argument, NULL, 'C'}, + {"ambig", no_argument, NULL, 'A'}, + {"line-len", required_argument, NULL, 'l'}, + {"default-qual", required_argument, NULL, 1}, + {"het-only", no_argument, NULL, 6}, + {"show-del", required_argument, NULL, 7}, + {"show-ins", required_argument, NULL, 8}, + {"output", required_argument, NULL, 'o'}, + {"incl-flags", required_argument, NULL, 11}, + {"rf", required_argument, NULL, 11}, + {"excl-flags", required_argument, NULL, 12}, + {"ff", required_argument, NULL, 12}, + {"min-MQ", required_argument, NULL, 13}, + {"P-het", required_argument, NULL, 15}, + {"mode", required_argument, NULL, 'm'}, + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:", + lopts, NULL)) >= 0) { + switch (c) { + case 'a': opts.all_bases++; break; + case 'q': opts.use_qual=1; break; + case 'q'+1000: opts.use_qual=0; break; + case 'm'+1000: opts.use_mqual=1; break; + case 'm'+1001: opts.use_mqual=0; break; + case 14: opts.scale_mqual = atof(optarg); break; + case 9: opts.low_mqual = atoi(optarg); break; + case 10: opts.high_mqual = atoi(optarg); break; + case 'd': opts.min_depth = atoi(optarg); break; + case 'c': opts.call_fract = atof(optarg); break; + case 'H': opts.het_fract = atof(optarg); break; + case 'r': opts.reg = optarg; break; + case 'C': opts.cons_cutoff = atoi(optarg); break; + case 'A': opts.ambig = 1; break; + case 1: opts.default_qual = atoi(optarg); break; + case 6: opts.het_only = 1; break; + case 7: opts.show_del = (*optarg == 'y' || *optarg == 'Y'); break; + case 8: opts.show_ins = (*optarg == 'y' || *optarg == 'Y'); break; + case 13: opts.min_mqual = atoi(optarg); break; + case 15: opts.P_het = atof(optarg); break; + case 'q'+100: opts.adj_qual = 1; break; + case 'q'+101: opts.adj_qual = 0; break; + case 'm'+100: opts.nm_adjust = 1; break; + case 'm'+101: opts.nm_adjust = 0; break; + case 'h'+100: opts.nm_halo = atoi(optarg); break; + case 'h'+101: 
opts.sc_cost = atoi(optarg); break; + + case 'm': // mode + if (strcasecmp(optarg, "simple") == 0) { + opts.gap5 = 0; + } else if (strcasecmp(optarg, "bayesian") == 0) { + opts.gap5 = 1; + } else { + fprintf(samtools_stderr, "Unknown mode %s\n", optarg); + return 1; + } + break; + + case 'l': + if ((opts.line_len = atoi(optarg)) <= 0) + opts.line_len = INT_MAX; + break; + + case 'f': + if (strcasecmp(optarg, "fasta") == 0) { + opts.fmt = FASTA; + } else if (strcasecmp(optarg, "fastq") == 0) { + opts.fmt = FASTQ; + } else if (strcasecmp(optarg, "pileup") == 0) { + opts.fmt = PILEUP; + } else { + fprintf(samtools_stderr, "Unknown format %s\n", optarg); + return 1; + } + break; + + case 'o': + if (!(opts.fp_out = fopen(optarg, "w"))) { + perror(optarg); + return 1; + } + break; + + case 11: + if ((opts.incl_flags = bam_str2flag(optarg)) < 0) { + print_error("consensus", "could not parse --rf %s", optarg); + return 1; + } + break; + case 12: + if ((opts.excl_flags = bam_str2flag(optarg)) < 0) { + print_error("consensus", "could not parse --ff %s", optarg); + return 1; + } + break; + + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + usage_exit(samtools_stderr, EXIT_FAILURE); + } + } + + if (argc != optind+1) { + if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS); + else usage_exit(samtools_stderr, EXIT_FAILURE); + } + opts.fp = sam_open_format(argv[optind], "r", &ga.in); + if (opts.fp == NULL) { + print_error_errno("consensus", "Cannot open input file \"%s\"", + argv[optind]); + goto err; + } + if (ga.nthreads > 0) + hts_set_threads(opts.fp, ga.nthreads); + + if (hts_set_opt(opts.fp, CRAM_OPT_DECODE_MD, 0)) { + fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + goto err; + } + + if (!(opts.h = sam_hdr_read(opts.fp))) { + fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", argv[optind]); + goto err; + } + + if (opts.reg) { + opts.idx = sam_index_load(opts.fp, 
argv[optind]); + if (!opts.idx) { + print_error("consensus", "Cannot load index for input file \"%s\"", + argv[optind]); + goto err; + } + opts.iter = sam_itr_querys(opts.idx, opts.h, opts.reg); + if (!opts.iter) { + print_error("consensus", "Failed to parse region \"%s\"", + opts.reg); + goto err; + } + } + + if (opts.fmt == PILEUP) { + if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, + basic_pileup, &opts) < 0) + goto err; + + if (opts.all_bases) { + int tid = opts.iter ? opts.iter->tid : opts.last_tid; + int len = sam_hdr_tid2len(opts.h, tid); + int pos = opts.last_pos; + if (opts.iter) { + len = MIN(opts.iter->end, len); + pos = MAX(opts.iter->beg, pos); + } + if (empty_pileup2(&opts, opts.h, tid, pos, len) < 0) + goto err; + } + } else { + if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, + basic_fasta, + &opts) < 0) + goto err; + if (opts.all_bases) { + // fill out terminator + int tid = opts.iter ? opts.iter->tid : opts.last_tid; + int len = sam_hdr_tid2len(opts.h, tid); + int pos = opts.last_pos; + if (opts.iter) { + len = MIN(opts.iter->end, len); + pos = MAX(opts.iter->beg, pos); + opts.last_tid = opts.iter->tid; + } + if (pos < len) { + if (ks_expand(&opts.ks_ins_seq, len-pos+1) < 0) + goto err; + if (ks_expand(&opts.ks_ins_qual, len-pos+1) < 0) + goto err; + while (pos++ < len) { + opts.ks_ins_seq.s [opts.ks_ins_seq.l++] = 'N'; + opts.ks_ins_qual.s[opts.ks_ins_qual.l++] = '!'; + } + opts.ks_ins_seq.s [opts.ks_ins_seq.l] = 0; + opts.ks_ins_qual.s[opts.ks_ins_qual.l] = 0; + } + } + if (opts.last_tid >= 0) + dump_fastq(&opts, sam_hdr_tid2name(opts.h, opts.last_tid), + opts.ks_ins_seq.s, opts.ks_ins_seq.l, + opts.ks_ins_qual.s, opts.ks_ins_qual.l); +// if (consensus_loop(&opts) < 0) { +// print_error_errno("consensus", "Failed"); +// goto err; +// } + } + + ret = 0; + + err: + if (opts.iter) + hts_itr_destroy(opts.iter); + if (opts.idx) + hts_idx_destroy(opts.idx); + + if (opts.fp && sam_close(opts.fp) < 0) { + 
print_error_errno("consensus", "Closing input file \"%s\"", + argv[optind]); + ret = 1; + } + + if (opts.h) + sam_hdr_destroy(opts.h); + sam_global_args_free(&ga); + + if (opts.fp_out && opts.fp_out != samtools_stdout) + ret |= fclose(opts.fp_out) != 0; + else + ret |= fflush(samtools_stdout) != 0; + + ks_free(&opts.ks_line); + ks_free(&opts.ks_ins_seq); + ks_free(&opts.ks_ins_qual); + + if (ret) + print_error("consensus", "failed"); + + return ret; +} diff --git a/samtools/bam_lpileup.c b/samtools/bam_lpileup.c index 3e7d1df..58c428f 100644 --- a/samtools/bam_lpileup.c +++ b/samtools/bam_lpileup.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "bam_plbuf.h" #include "bam_lpileup.h" -#include +#include "splaysort.h" #define TV_GAP 2 @@ -39,7 +39,7 @@ typedef struct __freenode_t { } freenode_t, *freenode_p; #define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) -KSORT_INIT(node, freenode_p, freenode_lt) +SPLAYSORT_INIT(node, freenode_p, freenode_lt) /* Memory pool, similar to the one in bam_pileup.c */ typedef struct { @@ -163,7 +163,7 @@ static int tview_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *p tv->aux[i] = tv->tail; // add a proper tail for the loop below tv->n_nodes = i; if (tv->n_nodes) { - ks_introsort(node, tv->n_nodes, tv->aux); + splaysort(node, tv->n_nodes, tv->aux); for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; tv->head = tv->aux[0]; } else tv->head = tv->tail; diff --git a/samtools/bam_lpileup.c.pysam.c b/samtools/bam_lpileup.c.pysam.c index 2a8432d..ea76417 100644 --- a/samtools/bam_lpileup.c.pysam.c +++ b/samtools/bam_lpileup.c.pysam.c @@ -31,7 +31,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "bam_plbuf.h" #include "bam_lpileup.h" -#include +#include "splaysort.h" #define TV_GAP 2 @@ -41,7 +41,7 @@ typedef struct __freenode_t { } freenode_t, *freenode_p; #define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) -KSORT_INIT(node, freenode_p, freenode_lt) +SPLAYSORT_INIT(node, freenode_p, freenode_lt) /* Memory pool, similar to the one in bam_pileup.c */ typedef struct { @@ -165,7 +165,7 @@ static int tview_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *p tv->aux[i] = tv->tail; // add a proper tail for the loop below tv->n_nodes = i; if (tv->n_nodes) { - ks_introsort(node, tv->n_nodes, tv->aux); + splaysort(node, tv->n_nodes, tv->aux); for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; tv->head = tv->aux[0]; } else tv->head = tv->tail; diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index 84ec1ec..83e8f73 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -37,6 +37,7 @@ Copyright (c) 2009,2018 The Broad Institute. MIT license. #include #include #include +#include #include "htslib/thread_pool.h" #include "htslib/sam.h" #include "sam_opts.h" @@ -66,6 +67,10 @@ typedef struct { char *stats_file; char *arg_list; char *out_fn; + regex_t *rgx; + int rgx_x; + int rgx_y; + int rgx_t; } md_param_t; typedef struct { @@ -103,7 +108,8 @@ typedef struct { long x; long y; int opt; - int xpos; + int beg; + int end; } check_t; @@ -683,7 +689,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n /* Get the position of the coordinates from the read name. 
*/ -static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { +static inline int get_coordinate_positions_colons(const char *qname, int *xpos, int *ypos) { int sep = 0; int pos = 0; @@ -711,14 +717,47 @@ static inline int get_coordinate_positions(const char *qname, int *xpos, int *yp return sep; } +/* Get the position of the coordinates from the read name. + Positions returned are of the x and y coordinate and an optional section of + the read name to test (t) for string equality e.g. lane and tile part. */ +static inline int get_coordinate_positions_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, int *xpos, int *ypos) { + regmatch_t matches[5]; + size_t max_matches = 5; -static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) { - int ret = 1; - int seps, xpos = 0, ypos = 0; - long x = 0, y = 0; - char *end; + if (!param->rgx_t) + max_matches = 4; + + if (regexec(param->rgx, qname, max_matches, matches, 0)) + return -1; + + *xpos = matches[param->rgx_x].rm_so; + *ypos = matches[param->rgx_y].rm_so; + + if (param->rgx_t) { + *t_beg = matches[param->rgx_t].rm_so; + *t_end = matches[param->rgx_t].rm_eo; + } else { + *t_beg = *t_end = 0; + } + + if (*xpos == -1 || *ypos == -1 || *t_beg == -1) + return -1; + + return 7; // 3, 4, 6 and 7 are successes in the previous function +} + + +static int get_coordinate_positions(md_param_t *param, const char *qname, int *beg, int *end, int *xpos, int *ypos, long *warnings) { + int ret = 0; + int seps; - seps = get_coordinate_positions(name, &xpos, &ypos); + if (param->rgx == NULL) { + seps = get_coordinate_positions_colons(qname, xpos, ypos); + *beg = 0; + *end = *xpos; + } else { + seps = get_coordinate_positions_regex(param, qname, beg, end, xpos, ypos); + } /* The most current Illumina read format at time of writing is: @machine:run:flowcell:lane:tile:x:y:UMI or @@ -732,9 +771,23 @@ static int get_coordinates(const char *name, int 
*xpos_out, long *x_coord, long (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name); + fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); } + ret = 1; + } + + return ret; +} + + +static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { + int ret = 1; + int xpos = 0, ypos = 0; + long x = 0, y = 0; + char *end; + + if (get_coordinate_positions(param, name, t_beg, t_end, &xpos, &ypos, warnings)) { return ret; } @@ -764,7 +817,6 @@ static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *x_coord = x; *y_coord = y; - *xpos_out = xpos; ret = 0; return ret; @@ -774,41 +826,25 @@ static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long /* Using the coordinates from the Illumina read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ -static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { - int ret = 0, seps; +static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { + int ret = 0; char *original, *duplicate; int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; + int o_beg = 0, o_end = 0, d_beg = 0, d_end = 0; original = bam_get_qname(ori); duplicate = bam_get_qname(dup); - seps = get_coordinate_positions(original, &oxpos, &oypos); - - if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original); - } - + if (get_coordinate_positions(param, original, &o_beg, &o_end, &oxpos, &oypos, warnings)) { return ret; } - seps = get_coordinate_positions(duplicate, &dxpos, &dypos); - - if (!(seps == 3 
|| seps == 4 || seps == 6 || seps == 7)) { - - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate); - } - + if (get_coordinate_positions(param, duplicate, &d_beg, &d_end, &dxpos, &dypos, warnings)) { return ret; } - if (strncmp(original, duplicate, oxpos - 1) == 0) { + if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { // the initial parts match, look at the numbers long ox, oy, dx, dy, xdiff, ydiff; char *end; @@ -889,19 +925,19 @@ static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warn This function needs the values from the first read to be already calculated. */ -static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { +static int optical_duplicate_partial(md_param_t *param, const char *name, const int o_beg, const int o_end, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { int ret = 0; char *duplicate; - int dxpos = 0; + int d_beg = 0, d_end = 0; long dx, dy; duplicate = bam_get_qname(dup); - if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) { + if (get_coordinates(param, duplicate, &d_beg, &d_end, &dx, &dy, warnings)) { return ret; } - if (strncmp(name, duplicate, oxpos - 1) == 0) { + if (strncmp(name + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { // the initial parts match, look at the numbers long xdiff, ydiff; @@ -926,7 +962,8 @@ static int optical_duplicate_partial(const char *name, const int oxpos, const lo c->x = dx; c->y = dy; - c->xpos = dxpos; + c->beg = d_beg; + c->end = d_end; return ret; } @@ -948,7 +985,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } if (param->opt_dist) { // mark optical duplicates - if (optical_duplicate(ori, dup, param->opt_dist, warn)) { + if (optical_duplicate(param, ori, 
dup, param->opt_dist, warn)) { bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; @@ -1026,16 +1063,14 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, check_list_t *list, long *warn, long *optical_single, long *optical_pair) { - int ret = 0; + int ret = 0, coord_fail = 0; char *ori_name = bam_get_qname(ori->b); read_queue_t *current = ori->duplicate; - int xpos; + int t_beg = 0, t_end = 0; long x, y; if (param->opt_dist) { - if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) { - return ret; - } + coord_fail = get_coordinates(param, ori_name, &t_beg, &t_end, &x, &y, warn); } list->length = 0; @@ -1090,7 +1125,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * } } - if (param->opt_dist) { + if (param->opt_dist && !coord_fail) { uint8_t *data; char *dup_type; int is_opt = 0; @@ -1105,7 +1140,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * } // need to run this to get the duplicates x and y scores - is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn); + is_opt = optical_duplicate_partial(param, ori_name, t_beg, t_end, x, y, current->b, c, param->opt_dist, warn); if (!c->opt && is_opt) { if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { @@ -1131,6 +1166,9 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * list->length++; } + if (!ret && coord_fail) + ret = coord_fail; + return ret; } @@ -1178,7 +1216,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has continue; // the number are right, check the names - if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0) + if (strncmp(cur_name + current->beg, bam_get_qname(chk->b) + chk->beg, current->end - current->beg) != 
0) continue; // optical duplicates @@ -2039,24 +2077,26 @@ static int markdup_usage(void) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools markdup \n\n"); fprintf(stderr, "Option: \n"); - fprintf(stderr, " -r Remove duplicate reads\n"); - fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); - fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); - fprintf(stderr, " -s Report stats.\n"); - fprintf(stderr, " -f NAME Write stats to named file. Implies -s.\n"); - fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); - fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); - fprintf(stderr, " -c Clear previous duplicate settings and tags.\n"); - fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" - " TYPE = t measure positions based on template start/end (default).\n" - " s measure positions based on sequence start.\n"); - fprintf(stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n"); - fprintf(stderr, " -u Output uncompressed data\n"); - fprintf(stderr, " --include-fails Include quality check failed reads.\n"); - fprintf(stderr, " --no-PG Do not add a PG line\n"); - fprintf(stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); - fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." - " Mainly for information and debugging.\n"); + fprintf(stderr, " -r Remove duplicate reads\n"); + fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); + fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); + fprintf(stderr, " -s Report stats.\n"); + fprintf(stderr, " -f NAME Write stats to named file. 
Implies -s.\n"); + fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); + fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); + fprintf(stderr, " -c Clear previous duplicate settings and tags.\n"); + fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" + " TYPE = t measure positions based on template start/end (default).\n" + " s measure positions based on sequence start.\n"); + fprintf(stderr, " -u Output uncompressed data\n"); + fprintf(stderr, " --include-fails Include quality check failed reads.\n"); + fprintf(stderr, " --no-PG Do not add a PG line\n"); + fprintf(stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); + fprintf(stderr, " --read-coords STR Regex for coords from read name.\n"); + fprintf(stderr, " --coords-order STR Order of regex elements. txy (default). With t being a part of\n" + " the read names that must be equal and x/y being coordinates.\n"); + fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
+ " Mainly for information and debugging.\n"); sam_global_opt_help(stderr, "-.O..@.."); @@ -2075,7 +2115,9 @@ int bam_markdup(int argc, char **argv) { kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL}; + char *regex = NULL; + char *regex_order = "txy"; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL, NULL, 0, 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2083,6 +2125,8 @@ int bam_markdup(int argc, char **argv) { {"no-PG", no_argument, NULL, 1002}, {"mode", required_argument, NULL, 'm'}, {"no-multi-dup", no_argument, NULL, 1003}, + {"read-coords", required_argument, NULL, 1004}, + {"coords-order", required_argument, NULL, 1005}, {NULL, 0, NULL, 0} }; @@ -2112,6 +2156,8 @@ int bam_markdup(int argc, char **argv) { case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; case 1003: param.check_chain = 0; break; + case 1004: regex = optarg; break; + case 1005: regex_order = optarg; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -2124,6 +2170,50 @@ int bam_markdup(int argc, char **argv) { if (param.opt_dist < 0) param.opt_dist = 0; if (param.max_length < 0) param.max_length = 300; + if (regex) { + int result; + + // set the order the elements of the regex are assigned to. + // x and y being coordinates, t being any other important part of the read + // e.g. 
tile and lane + // x and y order does not matter as long as it is consistent + + if ((strncmp(regex_order, "txy", 3) == 0) || (strncmp(regex_order, "tyx", 3) == 0)) { + param.rgx_t = 1; + param.rgx_x = 2; + param.rgx_y = 3; + } else if ((strncmp(regex_order, "xyt", 3) == 0) || (strncmp(regex_order, "yxt", 3) == 0)) { + param.rgx_x = 1; + param.rgx_y = 2; + param.rgx_t = 3; + } else if ((strncmp(regex_order, "xty", 3) == 0) || (strncmp(regex_order, "ytx", 3) == 0)) { + param.rgx_x = 1; + param.rgx_t = 2; + param.rgx_y = 3; + } else if ((strncmp(regex_order, "xy", 2) == 0) || (strncmp(regex_order, "yx", 2) == 0)) { + param.rgx_x = 1; + param.rgx_y = 2; + param.rgx_t = 0; + } else { + fprintf(stderr, "[markdup] error: could not recognise regex coorindate order \"%s\".\n", regex_order); + return 1; + } + + if ((param.rgx = malloc(sizeof(regex_t))) == NULL) { + fprintf(stderr, "[markdup] error: could not allocate memory for regex.\n"); + return 1; + } + + if ((result = regcomp(param.rgx, regex, REG_EXTENDED))) { + char err_msg[256]; + + regerror(result, param.rgx, err_msg, 256); + fprintf(stderr, "[markdup] error: regex error \"%s\"\n", err_msg); + free(param.rgx); + return 1; + } + } + param.in = sam_open_format(argv[optind], "r", &ga.in); if (!param.in) { @@ -2183,6 +2273,11 @@ int bam_markdup(int argc, char **argv) { if (p.pool) hts_tpool_destroy(p.pool); + if (param.rgx) { + regfree(param.rgx); + free(param.rgx); + } + free(param.arg_list); free(tmpprefix.s); sam_global_args_free(&ga); diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index a478956..06fb361 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -39,6 +39,7 @@ Copyright (c) 2009,2018 The Broad Institute. MIT license. 
#include #include #include +#include #include "htslib/thread_pool.h" #include "htslib/sam.h" #include "sam_opts.h" @@ -68,6 +69,10 @@ typedef struct { char *stats_file; char *arg_list; char *out_fn; + regex_t *rgx; + int rgx_x; + int rgx_y; + int rgx_t; } md_param_t; typedef struct { @@ -105,7 +110,8 @@ typedef struct { long x; long y; int opt; - int xpos; + int beg; + int end; } check_t; @@ -685,7 +691,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n /* Get the position of the coordinates from the read name. */ -static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { +static inline int get_coordinate_positions_colons(const char *qname, int *xpos, int *ypos) { int sep = 0; int pos = 0; @@ -713,14 +719,47 @@ static inline int get_coordinate_positions(const char *qname, int *xpos, int *yp return sep; } +/* Get the position of the coordinates from the read name. + Positions returned are of the x and y coordinate and an optional section of + the read name to test (t) for string equality e.g. lane and tile part. 
*/ +static inline int get_coordinate_positions_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, int *xpos, int *ypos) { + regmatch_t matches[5]; + size_t max_matches = 5; -static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) { - int ret = 1; - int seps, xpos = 0, ypos = 0; - long x = 0, y = 0; - char *end; + if (!param->rgx_t) + max_matches = 4; + + if (regexec(param->rgx, qname, max_matches, matches, 0)) + return -1; + + *xpos = matches[param->rgx_x].rm_so; + *ypos = matches[param->rgx_y].rm_so; + + if (param->rgx_t) { + *t_beg = matches[param->rgx_t].rm_so; + *t_end = matches[param->rgx_t].rm_eo; + } else { + *t_beg = *t_end = 0; + } + + if (*xpos == -1 || *ypos == -1 || *t_beg == -1) + return -1; + + return 7; // 3, 4, 6 and 7 are successes in the previous function +} + + +static int get_coordinate_positions(md_param_t *param, const char *qname, int *beg, int *end, int *xpos, int *ypos, long *warnings) { + int ret = 0; + int seps; - seps = get_coordinate_positions(name, &xpos, &ypos); + if (param->rgx == NULL) { + seps = get_coordinate_positions_colons(qname, xpos, ypos); + *beg = 0; + *end = *xpos; + } else { + seps = get_coordinate_positions_regex(param, qname, beg, end, xpos, ypos); + } /* The most current Illumina read format at time of writing is: @machine:run:flowcell:lane:tile:x:y:UMI or @@ -734,9 +773,23 @@ static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name); + fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); } + ret = 1; + } + + return ret; +} + + +static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { + int ret = 1; + int xpos = 0, ypos = 0; + 
long x = 0, y = 0; + char *end; + + if (get_coordinate_positions(param, name, t_beg, t_end, &xpos, &ypos, warnings)) { return ret; } @@ -766,7 +819,6 @@ static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *x_coord = x; *y_coord = y; - *xpos_out = xpos; ret = 0; return ret; @@ -776,41 +828,25 @@ static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long /* Using the coordinates from the Illumina read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ -static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { - int ret = 0, seps; +static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { + int ret = 0; char *original, *duplicate; int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; + int o_beg = 0, o_end = 0, d_beg = 0, d_end = 0; original = bam_get_qname(ori); duplicate = bam_get_qname(dup); - seps = get_coordinate_positions(original, &oxpos, &oypos); - - if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original); - } - + if (get_coordinate_positions(param, original, &o_beg, &o_end, &oxpos, &oypos, warnings)) { return ret; } - seps = get_coordinate_positions(duplicate, &dxpos, &dypos); - - if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { - - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate); - } - + if (get_coordinate_positions(param, duplicate, &d_beg, &d_end, &dxpos, &dypos, warnings)) { return ret; } - if (strncmp(original, duplicate, oxpos - 1) == 0) { + if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { // the initial parts match, look at the 
numbers long ox, oy, dx, dy, xdiff, ydiff; char *end; @@ -891,19 +927,19 @@ static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warn This function needs the values from the first read to be already calculated. */ -static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { +static int optical_duplicate_partial(md_param_t *param, const char *name, const int o_beg, const int o_end, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { int ret = 0; char *duplicate; - int dxpos = 0; + int d_beg = 0, d_end = 0; long dx, dy; duplicate = bam_get_qname(dup); - if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) { + if (get_coordinates(param, duplicate, &d_beg, &d_end, &dx, &dy, warnings)) { return ret; } - if (strncmp(name, duplicate, oxpos - 1) == 0) { + if (strncmp(name + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { // the initial parts match, look at the numbers long xdiff, ydiff; @@ -928,7 +964,8 @@ static int optical_duplicate_partial(const char *name, const int oxpos, const lo c->x = dx; c->y = dy; - c->xpos = dxpos; + c->beg = d_beg; + c->end = d_end; return ret; } @@ -950,7 +987,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } if (param->opt_dist) { // mark optical duplicates - if (optical_duplicate(ori, dup, param->opt_dist, warn)) { + if (optical_duplicate(param, ori, dup, param->opt_dist, warn)) { bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; @@ -1028,16 +1065,14 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, check_list_t *list, long *warn, long *optical_single, long *optical_pair) { - int ret = 0; + int ret = 0, coord_fail = 0; char *ori_name = bam_get_qname(ori->b); read_queue_t *current 
= ori->duplicate; - int xpos; + int t_beg = 0, t_end = 0; long x, y; if (param->opt_dist) { - if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) { - return ret; - } + coord_fail = get_coordinates(param, ori_name, &t_beg, &t_end, &x, &y, warn); } list->length = 0; @@ -1092,7 +1127,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * } } - if (param->opt_dist) { + if (param->opt_dist && !coord_fail) { uint8_t *data; char *dup_type; int is_opt = 0; @@ -1107,7 +1142,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * } // need to run this to get the duplicates x and y scores - is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn); + is_opt = optical_duplicate_partial(param, ori_name, t_beg, t_end, x, y, current->b, c, param->opt_dist, warn); if (!c->opt && is_opt) { if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { @@ -1133,6 +1168,9 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * list->length++; } + if (!ret && coord_fail) + ret = coord_fail; + return ret; } @@ -1180,7 +1218,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has continue; // the number are right, check the names - if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0) + if (strncmp(cur_name + current->beg, bam_get_qname(chk->b) + chk->beg, current->end - current->beg) != 0) continue; // optical duplicates @@ -2041,24 +2079,26 @@ static int markdup_usage(void) { fprintf(samtools_stderr, "\n"); fprintf(samtools_stderr, "Usage: samtools markdup \n\n"); fprintf(samtools_stderr, "Option: \n"); - fprintf(samtools_stderr, " -r Remove duplicate reads\n"); - fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); - fprintf(samtools_stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); - fprintf(samtools_stderr, " -s 
Report stats.\n"); - fprintf(samtools_stderr, " -f NAME Write stats to named file. Implies -s.\n"); - fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); - fprintf(samtools_stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); - fprintf(samtools_stderr, " -c Clear previous duplicate settings and tags.\n"); - fprintf(samtools_stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" - " TYPE = t measure positions based on template start/end (default).\n" - " s measure positions based on sequence start.\n"); - fprintf(samtools_stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n"); - fprintf(samtools_stderr, " -u Output uncompressed data\n"); - fprintf(samtools_stderr, " --include-fails Include quality check failed reads.\n"); - fprintf(samtools_stderr, " --no-PG Do not add a PG line\n"); - fprintf(samtools_stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); - fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." - " Mainly for information and debugging.\n"); + fprintf(samtools_stderr, " -r Remove duplicate reads\n"); + fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); + fprintf(samtools_stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); + fprintf(samtools_stderr, " -s Report stats.\n"); + fprintf(samtools_stderr, " -f NAME Write stats to named file. 
Implies -s.\n"); + fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); + fprintf(samtools_stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); + fprintf(samtools_stderr, " -c Clear previous duplicate settings and tags.\n"); + fprintf(samtools_stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" + " TYPE = t measure positions based on template start/end (default).\n" + " s measure positions based on sequence start.\n"); + fprintf(samtools_stderr, " -u Output uncompressed data\n"); + fprintf(samtools_stderr, " --include-fails Include quality check failed reads.\n"); + fprintf(samtools_stderr, " --no-PG Do not add a PG line\n"); + fprintf(samtools_stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); + fprintf(samtools_stderr, " --read-coords STR Regex for coords from read name.\n"); + fprintf(samtools_stderr, " --coords-order STR Order of regex elements. txy (default). With t being a part of\n" + " the read names that must be equal and x/y being coordinates.\n"); + fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
+ " Mainly for information and debugging.\n"); sam_global_opt_help(samtools_stderr, "-.O..@.."); @@ -2077,7 +2117,9 @@ int bam_markdup(int argc, char **argv) { kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL}; + char *regex = NULL; + char *regex_order = "txy"; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL, NULL, 0, 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2085,6 +2127,8 @@ int bam_markdup(int argc, char **argv) { {"no-PG", no_argument, NULL, 1002}, {"mode", required_argument, NULL, 'm'}, {"no-multi-dup", no_argument, NULL, 1003}, + {"read-coords", required_argument, NULL, 1004}, + {"coords-order", required_argument, NULL, 1005}, {NULL, 0, NULL, 0} }; @@ -2114,6 +2158,8 @@ int bam_markdup(int argc, char **argv) { case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; case 1003: param.check_chain = 0; break; + case 1004: regex = optarg; break; + case 1005: regex_order = optarg; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -2126,6 +2172,50 @@ int bam_markdup(int argc, char **argv) { if (param.opt_dist < 0) param.opt_dist = 0; if (param.max_length < 0) param.max_length = 300; + if (regex) { + int result; + + // set the order the elements of the regex are assigned to. + // x and y being coordinates, t being any other important part of the read + // e.g. 
tile and lane + // x and y order does not matter as long as it is consistent + + if ((strncmp(regex_order, "txy", 3) == 0) || (strncmp(regex_order, "tyx", 3) == 0)) { + param.rgx_t = 1; + param.rgx_x = 2; + param.rgx_y = 3; + } else if ((strncmp(regex_order, "xyt", 3) == 0) || (strncmp(regex_order, "yxt", 3) == 0)) { + param.rgx_x = 1; + param.rgx_y = 2; + param.rgx_t = 3; + } else if ((strncmp(regex_order, "xty", 3) == 0) || (strncmp(regex_order, "ytx", 3) == 0)) { + param.rgx_x = 1; + param.rgx_t = 2; + param.rgx_y = 3; + } else if ((strncmp(regex_order, "xy", 2) == 0) || (strncmp(regex_order, "yx", 2) == 0)) { + param.rgx_x = 1; + param.rgx_y = 2; + param.rgx_t = 0; + } else { + fprintf(samtools_stderr, "[markdup] error: could not recognise regex coorindate order \"%s\".\n", regex_order); + return 1; + } + + if ((param.rgx = malloc(sizeof(regex_t))) == NULL) { + fprintf(samtools_stderr, "[markdup] error: could not allocate memory for regex.\n"); + return 1; + } + + if ((result = regcomp(param.rgx, regex, REG_EXTENDED))) { + char err_msg[256]; + + regerror(result, param.rgx, err_msg, 256); + fprintf(samtools_stderr, "[markdup] error: regex error \"%s\"\n", err_msg); + free(param.rgx); + return 1; + } + } + param.in = sam_open_format(argv[optind], "r", &ga.in); if (!param.in) { @@ -2185,6 +2275,11 @@ int bam_markdup(int argc, char **argv) { if (p.pool) hts_tpool_destroy(p.pool); + if (param.rgx) { + regfree(param.rgx); + free(param.rgx); + } + free(param.arg_list); free(tmpprefix.s); sam_global_args_free(&ga); diff --git a/samtools/bam_plbuf.h b/samtools/bam_plbuf.h index acf3338..9a718e0 100644 --- a/samtools/bam_plbuf.h +++ b/samtools/bam_plbuf.h @@ -1,6 +1,6 @@ /* bam_plbuf.h -- plbuf routines (declarations copied from bam.h). - Copyright (C) 2008, 2013 Genome Research Ltd. + Copyright (C) 2008, 2013, 2021 Genome Research Ltd. 
Author: Heng Li @@ -41,13 +41,20 @@ typedef struct { #ifdef __cplusplus extern "C" { #endif - void bam_plbuf_reset(bam_plbuf_t *buf); +/* Exported from bam_plbuf.c */ +void bam_plbuf_reset(bam_plbuf_t *buf); - bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); - void bam_plbuf_destroy(bam_plbuf_t *buf); +void bam_plbuf_destroy(bam_plbuf_t *buf); - int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); + +/* Exported from bam_plcmd.c */ +int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, + hts_pos_t ref_len, const char *ref, kstring_t *ks, + int rev_del, int no_ins, int no_ins_mods, + int no_del, int no_ends); #ifdef __cplusplus } #endif diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index 10e79c6..9b49500 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -46,6 +46,7 @@ DEALINGS IN THE SOFTWARE. */ #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" +#include "bam_plbuf.h" #define dummy_free(p) KLIST_INIT(auxlist, char *, dummy_free) @@ -65,10 +66,10 @@ static inline int printw(int c, FILE *fp) return 0; } -static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, - hts_pos_t ref_len, const char *ref, kstring_t *ks, - int rev_del, int no_ins, int no_ins_mods, - int no_del, int no_ends) +int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, + hts_pos_t ref_len, const char *ref, kstring_t *ks, + int rev_del, int no_ins, int no_ins_mods, + int no_del, int no_ends) { no_ins_mods |= no_ins; int j; @@ -161,12 +162,8 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, return 0; } -#include -#include "bam2bcf.h" #include "sample.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) #define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) @@ -174,7 +171,6 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, 
hts_pos_t pos, #define MPLP_REDO_BAQ (1<<6) #define MPLP_ILLUMINA13 (1<<7) #define MPLP_IGNORE_RG (1<<8) -#define MPLP_PER_SAMPLE (1<<9) #define MPLP_SMART_OVERLAPS (1<<10) #define MPLP_PRINT_MAPQ_CHAR (1<<11) @@ -199,10 +195,8 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_MAX_INDEL_DEPTH 250 typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all, rev_del; + int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, all, rev_del; int rflag_require, rflag_filter; - int openQ, extQ, tandemQ, min_support; // for indels - double min_frac; // for indels char *reg, *pl_list, *fai_fname, *output_fname; faidx_t *fai; void *bed, *rghash, *auxlist; @@ -439,33 +433,6 @@ static int mplp_func(void *data, bam1_t *b) return ret; } -static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, - int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg) -{ - int i, j; - memset(m->n_plp, 0, m->n * sizeof(int)); - for (i = 0; i < n; ++i) { - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - uint8_t *q; - int id = -1; - q = ignore_rg? NULL : bam_aux_get(p->b, "RG"); - if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); - if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); - if (id < 0 || id >= m->n) { - assert(q); // otherwise a bug - fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); - exit(EXIT_FAILURE); - } - if (m->n_plp[id] == m->m_plp[id]) { - m->m_plp[id] = m->m_plp[id]? 
m->m_plp[id]<<1 : 8; - m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); - } - m->plp[id][m->n_plp[id]++] = *p; - } - } -} - /* * Performs pileup * @param conf configuration for this pileup @@ -475,32 +442,22 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, */ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) { - extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); - extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, *n_plp, tid0 = 0, max_depth, max_indel_depth; + int i, tid, *n_plp, tid0 = 0, max_depth; hts_pos_t pos, beg0 = 0, end0 = HTS_POS_MAX, ref_len; const bam_pileup1_t **plp; mplp_ref_t mp_ref = MPLP_REF_INIT; bam_mplp_t iter; sam_hdr_t *h = NULL; /* header of first file in input list */ char *ref; - void *rghash = NULL; FILE *pileup_fp = NULL; - bcf_callaux_t *bca = NULL; - bcf_callret1_t *bcr = NULL; - bcf_call_t bc; - htsFile *bcf_fp = NULL; - bcf_hdr_t *bcf_hdr = NULL; - bam_sample_t *sm = NULL; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); - memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(mplp_aux_t*)); plp = calloc(n, sizeof(bam_pileup1_t*)); n_plp = calloc(n, sizeof(int)); @@ -550,10 +507,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) exit(EXIT_FAILURE); } bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : sam_hdr_str(h_tmp)); - if (conf->flag & MPLP_BCF) { - // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) - rghash = bcf_call_add_rg(rghash, sam_hdr_str(h_tmp), conf->pl_list); - } if (conf->reg) { hts_idx_t *idx = NULL; // If index filename has not been specfied, look in BAM folder @@ -588,151 +541,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } } fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); - if (conf->flag & MPLP_BCF) - { - const char *mode; - // allocate data storage proportionate to number of samples being studied sm->n - gplp.n = sm->n; - gplp.n_plp = calloc(sm->n, sizeof(int)); - gplp.m_plp = calloc(sm->n, sizeof(int)); - gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); - // write the VCF header + pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout; - if ( conf->flag & MPLP_VCF ) - mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF - else - mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb"; // uncompressed BCF or compressed BCF - - bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); - if (bcf_fp == NULL) { - fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); - exit(EXIT_FAILURE); - } - autoflush_if_stdout(bcf_fp, conf->output_fname); - - // BCF header creation - bcf_hdr = bcf_hdr_init("w"); - kstring_t str = {0,0,NULL}; - - ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version()); - bcf_hdr_append(bcf_hdr, str.s); - - str.l = 0; - ksprintf(&str, "##samtoolsCommand=samtools mpileup"); - for (i=1; iargc; i++) ksprintf(&str, " %s", conf->argv[i]); - kputc('\n', &str); - bcf_hdr_append(bcf_hdr, str.s); - - if (conf->fai_fname) - { - str.l = 0; - ksprintf(&str, "##reference=file://%s\n", conf->fai_fname); - bcf_hdr_append(bcf_hdr, str.s); - } - - // Translate BAM @SQ tags to BCF ##contig tags - // todo: use/write new BAM header manipulation routines, fill also UR, M5 - for (i=0; i < sam_hdr_nref(h); i++) - { - str.l = 0; - ksprintf(&str, "##contig=", sam_hdr_tid2name(h, i), (int64_t) sam_hdr_tid2len(h, i)); - bcf_hdr_append(bcf_hdr, str.s); - } - free(str.s); - bcf_hdr_append(bcf_hdr,"##ALT="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); -#if CDF_MWU_TESTS - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); -#endif - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_DP ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_DV ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_DPR ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_DPR ) - 
bcf_hdr_append(bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_FMT_DP4 ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_SP ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_AD ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADF ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADR ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_AD ) - bcf_hdr_append(bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_ADF ) - bcf_hdr_append(bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_ADR ) - bcf_hdr_append(bcf_hdr,"##INFO="); - - for (i=0; in; i++) - bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); - bcf_hdr_add_sample(bcf_hdr, NULL); - if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { - print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", - conf->output_fname? conf->output_fname : "standard output"); - exit(EXIT_FAILURE); - } - // End of BCF header creation - - // Initialise the calling algorithm - bca = bcf_call_init(-1., conf->min_baseQ); - bcr = calloc(sm->n, sizeof(bcf_callret1_t)); - bca->rghash = rghash; - bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; - bca->min_frac = conf->min_frac; - bca->min_support = conf->min_support; - bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; - - bc.bcf_hdr = bcf_hdr; - bc.n = sm->n; - bc.PL = malloc(15 * sm->n * sizeof(*bc.PL)); - if (conf->fmt_flag) - { - assert( sizeof(float)==sizeof(int32_t) ); - bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4); - bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields - if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) - { - // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample - bc.ADR = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); - bc.ADF = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); - for (i=0; in; i++) - { - 
bcr[i].ADR = bc.ADR + (i+1)*B2B_MAX_ALLELES; - bcr[i].ADF = bc.ADF + (i+1)*B2B_MAX_ALLELES; - } - } - } - } - else { - pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout; - - if (pileup_fp == NULL) { - fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); - exit(EXIT_FAILURE); - } + if (pileup_fp == NULL) { + fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); + exit(EXIT_FAILURE); } // init pileup @@ -751,10 +565,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(stderr, "[%s] Combined max depth is above 1M. Potential memory hog!\n", __func__); } - // Only used when writing BCF - max_indel_depth = conf->max_indel_depth * sm->n; + bam_mplp_set_maxcnt(iter, max_depth); - bcf1_t *bcf_rec = bcf_init1(); int ret; int last_tid = -1; hts_pos_t last_pos = -1; @@ -764,256 +576,227 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested mplp_get_ref(data[0], tid, &ref, &ref_len); //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); - if (conf->flag & MPLP_BCF) { - int total_depth, _ref0, ref16; - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; - for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; - group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); - _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; - ref16 = seq_nt16_table[_ref0]; - bcf_callaux_clean(bca, &bc); - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bc.tid = tid; bc.pos = pos; - bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); - if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { - print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", - conf->output_fname?conf->output_fname:"standard output"); - exit(EXIT_FAILURE); - } - // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) - { - bcf_callaux_clean(bca, &bc); - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); - if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { - print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", - conf->output_fname?conf->output_fname:"standard output"); - exit(EXIT_FAILURE); + if (conf->all) { + // Deal with missing portions of previous tids + while (tid > last_tid) { + if (last_tid >= 0 && !conf->reg) { + while (++last_pos < sam_hdr_tid2len(h, last_tid)) { + if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); } } + last_tid++; + last_pos = -1; + if (conf->all < 2) + break; } - } else { - if (conf->all) { - // Deal with missing portions of previous tids - while (tid > last_tid) { - if (last_tid >= 0 && !conf->reg) { - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, 
last_tid), last_pos, last_pos + 1) == 0) - continue; - print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); + } + if (conf->all) { + // Deal with missing portion of current tid + while (++last_pos < pos) { + if (conf->reg && last_pos < beg0) continue; // out of range; skip + if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); + } + last_tid = tid; + last_pos = pos; + } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; + + fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + int j, cnt; + for (j = cnt = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) ++cnt; + } + fprintf(pileup_fp, "\t%d\t", cnt); + if (n_plp[i] == 0) { + fputs("*\t*", pileup_fp); + int flag_value = MPLP_PRINT_MAPQ_CHAR; + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) + fputs("\t*", pileup_fp); + flag_value <<= 1; + } + if (conf->auxlist) { + int t = 0; + while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) + fputs("\t*", pileup_fp); + } + } else { + int n = 0; + kstring_t ks = KS_INITIALIZE; + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = p->qpos < p->b->core.l_qseq + ? 
bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) { + n++; + if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, + ref, &ks, conf->rev_del, + conf->no_ins, conf->no_ins_mods, + conf->no_del, conf->no_ends) < 0) { + ret = 1; + goto fail; } } - last_tid++; - last_pos = -1; - if (conf->all < 2) - break; - } - } - if (conf->all) { - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (conf->reg && last_pos < beg0) continue; // out of range; skip - if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; - print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); } - last_tid = tid; - last_pos = pos; - } - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; + if (!n) putc('*', pileup_fp); - fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - int j, cnt; - for (j = cnt = 0; j < n_plp[i]; ++j) { + /* Print base qualities */ + n = 0; + ks_free(&ks); + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if (c >= conf->min_baseQ) ++cnt; - } - fprintf(pileup_fp, "\t%d\t", cnt); - if (n_plp[i] == 0) { - fputs("*\t*", pileup_fp); - int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_LAST) { - if (flag_value != MPLP_PRINT_MODS - && (conf->flag & flag_value)) - fputs("\t*", pileup_fp); - flag_value <<= 1; + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) { + c = c + 33 < 126? 
c + 33 : 126; + putc(c, pileup_fp); + n++; } - if (conf->auxlist) { - int t = 0; - while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) - fputs("\t*", pileup_fp); - } - } else { - int n = 0; - kstring_t ks = KS_INITIALIZE; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if (c >= conf->min_baseQ) { + } + if (!n) putc('*', pileup_fp); + + /* Print selected columns */ + int flag_value = MPLP_PRINT_MAPQ_CHAR; + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) { + n = 0; + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = &plp[i][j]; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if ( c < conf->min_baseQ ) continue; + if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp); n++; - if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, - ref, &ks, conf->rev_del, - conf->no_ins, conf->no_ins_mods, - conf->no_del, conf->no_ends) < 0) { - ret = 1; - goto fail; + + switch (flag_value) { + case MPLP_PRINT_MAPQ_CHAR: + c = p->b->core.qual + 33; + if (c > 126) c = 126; + putc(c, pileup_fp); + break; + case MPLP_PRINT_QPOS: + // query position in current orientation + fprintf(pileup_fp, "%d", p->qpos + 1); + break; + case MPLP_PRINT_QPOS5: { + // query position in 5' to 3' orientation + int pos5 = bam_is_rev(p->b) + ? 
p->b->core.l_qseq-p->qpos + p->is_del + : p->qpos + 1; + fprintf(pileup_fp, "%d", pos5); + break; + } + case MPLP_PRINT_QNAME: + fputs(bam_get_qname(p->b), pileup_fp); + break; + case MPLP_PRINT_FLAG: + fprintf(pileup_fp, "%d", p->b->core.flag); + break; + case MPLP_PRINT_RNAME: + if (p->b->core.tid >= 0) + fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); + else + putc('*', pileup_fp); + break; + case MPLP_PRINT_POS: + fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); + break; + case MPLP_PRINT_MAPQ: + fprintf(pileup_fp, "%d", p->b->core.qual); + break; + case MPLP_PRINT_RNEXT: + if (p->b->core.mtid >= 0) + fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); + else + putc('*', pileup_fp); + break; + case MPLP_PRINT_PNEXT: + fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); + break; } } + if (!n) putc('*', pileup_fp); } - if (!n) putc('*', pileup_fp); - - /* Print base qualities */ - n = 0; - ks_free(&ks); - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if (c >= conf->min_baseQ) { - c = c + 33 < 126? c + 33 : 126; - putc(c, pileup_fp); + flag_value <<= 1; + } + + /* Print selected tags */ + klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); + if (auxlist_p && auxlist_p->size) { + kliter_t(auxlist) *aux; + for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { + n = 0; + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = &plp[i][j]; + int c = p->qpos < p->b->core.l_qseq + ? 
bam_get_qual(p->b)[p->qpos] + : 0; + if ( c < conf->min_baseQ ) continue; + + if (n > 0) putc(conf->sep, pileup_fp); n++; - } - } - if (!n) putc('*', pileup_fp); - - /* Print selected columns */ - int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_LAST) { - if (flag_value != MPLP_PRINT_MODS - && (conf->flag & flag_value)) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = &plp[i][j]; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if ( c < conf->min_baseQ ) continue; - if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp); - n++; - - switch (flag_value) { - case MPLP_PRINT_MAPQ_CHAR: - c = p->b->core.qual + 33; - if (c > 126) c = 126; - putc(c, pileup_fp); - break; - case MPLP_PRINT_QPOS: - // query position in current orientation - fprintf(pileup_fp, "%d", p->qpos + 1); - break; - case MPLP_PRINT_QPOS5: { - // query position in 5' to 3' orientation - int pos5 = bam_is_rev(p->b) - ? 
p->b->core.l_qseq-p->qpos + p->is_del - : p->qpos + 1; - fprintf(pileup_fp, "%d", pos5); - break; - } - case MPLP_PRINT_QNAME: - fputs(bam_get_qname(p->b), pileup_fp); - break; - case MPLP_PRINT_FLAG: - fprintf(pileup_fp, "%d", p->b->core.flag); - break; - case MPLP_PRINT_RNAME: - if (p->b->core.tid >= 0) - fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); - else - putc('*', pileup_fp); - break; - case MPLP_PRINT_POS: - fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); - break; - case MPLP_PRINT_MAPQ: - fprintf(pileup_fp, "%d", p->b->core.qual); - break; - case MPLP_PRINT_RNEXT: - if (p->b->core.mtid >= 0) - fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); - else - putc('*', pileup_fp); - break; - case MPLP_PRINT_PNEXT: - fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); - break; - } + uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); + if (!tag_u) { + putc(conf->empty , pileup_fp); + continue; + } + + int tag_supported = 0; + + /* Tag value is string */ + if (*tag_u == 'Z' || *tag_u == 'H') { + char *tag_s = bam_aux2Z(tag_u); + if (!tag_s) continue; + fputs(tag_s, pileup_fp); + tag_supported = 1; } - if (!n) putc('*', pileup_fp); - } - flag_value <<= 1; - } - /* Print selected tags */ - klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); - if (auxlist_p && auxlist_p->size) { - kliter_t(auxlist) *aux; - for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = &plp[i][j]; - int c = p->qpos < p->b->core.l_qseq - ? 
bam_get_qual(p->b)[p->qpos] - : 0; - if ( c < conf->min_baseQ ) continue; - - if (n > 0) putc(conf->sep, pileup_fp); - n++; - uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); - if (!tag_u) { - putc(conf->empty , pileup_fp); - continue; - } - - /* Tag value is string */ - if (*tag_u == 'Z' || *tag_u == 'H') { - char *tag_s = bam_aux2Z(tag_u); - if (!tag_s) continue; - fputs(tag_s, pileup_fp); - } - - /* Tag value is integer */ - if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { - int64_t tag_i = bam_aux2i(tag_u); - fprintf(pileup_fp, "%" PRId64 "", tag_i); - } - - /* Tag value is float */ - if (*tag_u == 'd' || *tag_u == 'f') { - double tag_f = bam_aux2f(tag_u); - fprintf(pileup_fp, "%lf", tag_f); - } - - /* Tag value is character */ - if (*tag_u == 'A') { - char tag_c = bam_aux2A(tag_u); - putc(tag_c, pileup_fp); - } + /* Tag value is integer */ + if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { + int64_t tag_i = bam_aux2i(tag_u); + fprintf(pileup_fp, "%" PRId64 "", tag_i); + tag_supported = 1; } - if (!n) putc('*', pileup_fp); + + /* Tag value is float */ + if (*tag_u == 'd' || *tag_u == 'f') { + double tag_f = bam_aux2f(tag_u); + fprintf(pileup_fp, "%lf", tag_f); + tag_supported = 1; + } + + /* Tag value is character */ + if (*tag_u == 'A') { + char tag_c = bam_aux2A(tag_u); + putc(tag_c, pileup_fp); + tag_supported = 1; + } + + if (!tag_supported) putc('*', pileup_fp); } + if (!n) putc('*', pileup_fp); } } } - putc('\n', pileup_fp); } + putc('\n', pileup_fp); } if (ret < 0) { @@ -1022,7 +805,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) goto fail; } - if (conf->all && !(conf->flag & MPLP_BCF)) { + if (conf->all) { // Handle terminating region if (last_tid < 0 && conf->reg && conf->all > 1) { last_tid = tid0; @@ -1045,26 +828,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fail: // clean up - 
free(bc.tmp.s); - bcf_destroy1(bcf_rec); - if (bcf_fp) - { - release_autoflush(bcf_fp); - hts_close(bcf_fp); - bcf_hdr_destroy(bcf_hdr); - bcf_call_destroy(bca); - free(bc.PL); - free(bc.DP4); - free(bc.ADR); - free(bc.ADF); - free(bc.fmt_arr); - free(bcr); - } if (pileup_fp && conf->output_fname) fclose(pileup_fp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); - bcf_call_del_rghash(rghash); bam_mplp_destroy(iter); sam_hdr_destroy(h); for (i = 0; i < n; ++i) { @@ -1143,35 +910,6 @@ int read_file_list(const char *file_list,int *n,char **argv[]) } #undef MAX_PATH_LEN -int parse_format_flag(const char *str) -{ - int i, flag = 0, n_tags; - char **tags = hts_readlist(str, 0, &n_tags); - for(i=0; irflag_require); @@ -1235,18 +973,13 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) sam_global_opt_help(fp, "-.--.--."); fprintf(fp, "\n" -"Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n" -"deprecated. To output these formats, please use \"bcftools mpileup\" instead.\n"); +"Note that using \"samtools mpileup\" to generate BCF or VCF files has been\n" +"removed. To output these formats, please use \"bcftools mpileup\" instead.\n"); free(tmp_require); free(tmp_filter); } -static void deprecated(char opt) { - fprintf(stderr, "[warning] samtools mpileup option `%c` is functional, " - "but deprecated. 
Please switch to using bcftools mpileup in future.\n", opt); -} - int bam_mpileup(int argc, char *argv[]) { int c; @@ -1258,9 +991,6 @@ int bam_mpileup(int argc, char *argv[]) mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = MPLP_MAX_DEPTH; - mplp.max_indel_depth = MPLP_MAX_INDEL_DEPTH; - mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; - mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; @@ -1279,7 +1009,6 @@ int bam_mpileup(int argc, char *argv[]) {"incl-flags", required_argument, NULL, 1}, {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, - {"open-prob", required_argument, NULL, 4}, {"output-QNAME", no_argument, NULL, 5}, {"output-qname", no_argument, NULL, 5}, {"illumina1.3+", no_argument, NULL, '6'}, @@ -1304,10 +1033,6 @@ int bam_mpileup(int argc, char *argv[]) {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, {"ignore-overlaps", no_argument, NULL, 'x'}, - {"BCF", no_argument, NULL, 'g'}, - {"bcf", no_argument, NULL, 'g'}, - {"VCF", no_argument, NULL, 'v'}, - {"vcf", no_argument, NULL, 'v'}, {"output-mods", no_argument, NULL, 'M'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, @@ -1315,8 +1040,6 @@ int bam_mpileup(int argc, char *argv[]) {"output-bp-5", no_argument, NULL, 14}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, - {"output-tags", required_argument, NULL, 't'}, - {"uncompressed", no_argument, NULL, 'u'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"tandem-qual", required_argument, NULL, 'h'}, @@ -1338,7 +1061,7 @@ int bam_mpileup(int argc, char *argv[]) {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:aM",lopts,NULL)) >= 0) { + while ((c 
= getopt_long(argc, argv, "Af:r:l:q:Q:RC:Bd:b:o:EG:6OsxXaM",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1350,7 +1073,6 @@ int bam_mpileup(int argc, char *argv[]) if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; - case 4 : mplp.openQ = atoi(optarg); break; case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; case 6 : mplp.rev_del = 1; break; case 7 : @@ -1379,54 +1101,21 @@ int bam_mpileup(int argc, char *argv[]) mplp.bed = bed_read(optarg); if (!mplp.bed) { print_error_errno("mpileup", "Could not read file \"%s\"", optarg); return 1; } break; - case 'P': mplp.pl_list = strdup(optarg); deprecated(c); break; - case 'p': mplp.flag |= MPLP_PER_SAMPLE; deprecated(c); break; - case 'g': mplp.flag |= MPLP_BCF; deprecated(c); break; - case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break; - case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'X': has_index_file = 1; break; - case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break; - case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break; - case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break; - case 'I': mplp.flag |= MPLP_NO_INDEL; deprecated(c); break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break; - case 'O': - if (!(mplp.flag & MPLP_PRINT_QPOS5)) - mplp.flag |= MPLP_PRINT_QPOS; - break; - case 14: - mplp.flag |= MPLP_PRINT_QPOS5; - mplp.flag &= ~MPLP_PRINT_QPOS; - break; + case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; + case 14: mplp.flag |= MPLP_PRINT_QPOS5; break; case 'M': mplp.flag |= MPLP_PRINT_MODS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); 
break; case 'b': file_list = optarg; break; - case 'o': { - char *end; - long value = strtol(optarg, &end, 10); - // Distinguish between -o INT and -o FILE (a bit of a hack!) - if (*end == '\0') { - mplp.openQ = value; - fprintf(stderr, "[warning] samtools mpileup option " - "'--open-prob INT' is functional, but deprecated. " - "Please switch to using bcftools mpileup in future.\n"); - } else { - mplp.output_fname = optarg; - } - } - break; - case 'e': mplp.extQ = atoi(optarg); deprecated(c); break; - case 'h': mplp.tandemQ = atoi(optarg); deprecated(c); break; + case 'o': mplp.output_fname = optarg; break; case 'A': use_orphan = 1; break; - case 'F': mplp.min_frac = atof(optarg); deprecated(c); break; - case 'm': mplp.min_support = atoi(optarg); deprecated(c); break; - case 'L': mplp.max_indel_depth = atoi(optarg); deprecated(c); break; case 'G': { FILE *fp_rg; char buf[1024]; @@ -1438,7 +1127,6 @@ int bam_mpileup(int argc, char *argv[]) fclose(fp_rg); } break; - case 't': mplp.fmt_flag |= parse_format_flag(optarg); deprecated(c); break; case 'a': mplp.all++; break; default: if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break; diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 7eb601d..c8252cd 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -48,6 +48,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" +#include "bam_plbuf.h" #define dummy_free(p) KLIST_INIT(auxlist, char *, dummy_free) @@ -67,10 +68,10 @@ static inline int printw(int c, FILE *fp) return 0; } -static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, - hts_pos_t ref_len, const char *ref, kstring_t *ks, - int rev_del, int no_ins, int no_ins_mods, - int no_del, int no_ends) +int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, + hts_pos_t ref_len, const char *ref, kstring_t *ks, + int rev_del, int no_ins, int no_ins_mods, + int no_del, int no_ends) { no_ins_mods |= no_ins; int j; @@ -163,12 +164,8 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, return 0; } -#include -#include "bam2bcf.h" #include "sample.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) #define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) @@ -176,7 +173,6 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_REDO_BAQ (1<<6) #define MPLP_ILLUMINA13 (1<<7) #define MPLP_IGNORE_RG (1<<8) -#define MPLP_PER_SAMPLE (1<<9) #define MPLP_SMART_OVERLAPS (1<<10) #define MPLP_PRINT_MAPQ_CHAR (1<<11) @@ -201,10 +197,8 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_MAX_INDEL_DEPTH 250 typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all, rev_del; + int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, all, rev_del; int rflag_require, rflag_filter; - int openQ, extQ, tandemQ, min_support; // for indels - double min_frac; // for indels char *reg, *pl_list, *fai_fname, *output_fname; faidx_t *fai; void *bed, *rghash, *auxlist; @@ -441,33 +435,6 @@ static int mplp_func(void *data, bam1_t *b) return ret; } -static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, - int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, 
int ignore_rg) -{ - int i, j; - memset(m->n_plp, 0, m->n * sizeof(int)); - for (i = 0; i < n; ++i) { - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - uint8_t *q; - int id = -1; - q = ignore_rg? NULL : bam_aux_get(p->b, "RG"); - if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); - if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); - if (id < 0 || id >= m->n) { - assert(q); // otherwise a bug - fprintf(samtools_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); - samtools_exit(EXIT_FAILURE); - } - if (m->n_plp[id] == m->m_plp[id]) { - m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; - m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); - } - m->plp[id][m->n_plp[id]++] = *p; - } - } -} - /* * Performs pileup * @param conf configuration for this pileup @@ -477,32 +444,22 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, */ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) { - extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); - extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, *n_plp, tid0 = 0, max_depth, max_indel_depth; + int i, tid, *n_plp, tid0 = 0, max_depth; hts_pos_t pos, beg0 = 0, end0 = HTS_POS_MAX, ref_len; const bam_pileup1_t **plp; mplp_ref_t mp_ref = MPLP_REF_INIT; bam_mplp_t iter; sam_hdr_t *h = NULL; /* header of first file in input list */ char *ref; - void *rghash = NULL; FILE *pileup_fp = NULL; - bcf_callaux_t *bca = NULL; - bcf_callret1_t *bcr = NULL; - bcf_call_t bc; - htsFile *bcf_fp = NULL; - bcf_hdr_t *bcf_hdr = NULL; - bam_sample_t *sm = NULL; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); - memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(mplp_aux_t*)); plp = calloc(n, sizeof(bam_pileup1_t*)); n_plp = calloc(n, 
sizeof(int)); @@ -552,10 +509,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) samtools_exit(EXIT_FAILURE); } bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : sam_hdr_str(h_tmp)); - if (conf->flag & MPLP_BCF) { - // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) - rghash = bcf_call_add_rg(rghash, sam_hdr_str(h_tmp), conf->pl_list); - } if (conf->reg) { hts_idx_t *idx = NULL; // If index filename has not been specfied, look in BAM folder @@ -590,151 +543,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } } fprintf(samtools_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); - if (conf->flag & MPLP_BCF) - { - const char *mode; - // allocate data storage proportionate to number of samples being studied sm->n - gplp.n = sm->n; - gplp.n_plp = calloc(sm->n, sizeof(int)); - gplp.m_plp = calloc(sm->n, sizeof(int)); - gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); - // write the VCF header + pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : samtools_stdout; - if ( conf->flag & MPLP_VCF ) - mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF - else - mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb"; // uncompressed BCF or compressed BCF - - bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); - if (bcf_fp == NULL) { - fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); - samtools_exit(EXIT_FAILURE); - } - autoflush_if_stdout(bcf_fp, conf->output_fname); - - // BCF header creation - bcf_hdr = bcf_hdr_init("w"); - kstring_t str = {0,0,NULL}; - - ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version()); - bcf_hdr_append(bcf_hdr, str.s); - - str.l = 0; - ksprintf(&str, "##samtoolsCommand=samtools mpileup"); - for (i=1; iargc; i++) ksprintf(&str, " %s", conf->argv[i]); - kputc('\n', &str); - bcf_hdr_append(bcf_hdr, str.s); - - if (conf->fai_fname) - { - str.l = 0; - ksprintf(&str, "##reference=file://%s\n", conf->fai_fname); - bcf_hdr_append(bcf_hdr, str.s); - } - - // Translate BAM @SQ tags to BCF ##contig tags - // todo: use/write new BAM header manipulation routines, fill also UR, M5 - for (i=0; i < sam_hdr_nref(h); i++) - { - str.l = 0; - ksprintf(&str, "##contig=", sam_hdr_tid2name(h, i), (int64_t) sam_hdr_tid2len(h, i)); - bcf_hdr_append(bcf_hdr, str.s); - } - free(str.s); - bcf_hdr_append(bcf_hdr,"##ALT="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); -#if CDF_MWU_TESTS - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); -#endif - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##INFO="); - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_DP ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_DV ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_DPR ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_DPR ) 
- bcf_hdr_append(bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_FMT_DP4 ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_SP ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_AD ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADF ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADR ) - bcf_hdr_append(bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_AD ) - bcf_hdr_append(bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_ADF ) - bcf_hdr_append(bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_ADR ) - bcf_hdr_append(bcf_hdr,"##INFO="); - - for (i=0; in; i++) - bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); - bcf_hdr_add_sample(bcf_hdr, NULL); - if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { - print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", - conf->output_fname? conf->output_fname : "standard output"); - samtools_exit(EXIT_FAILURE); - } - // End of BCF header creation - - // Initialise the calling algorithm - bca = bcf_call_init(-1., conf->min_baseQ); - bcr = calloc(sm->n, sizeof(bcf_callret1_t)); - bca->rghash = rghash; - bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; - bca->min_frac = conf->min_frac; - bca->min_support = conf->min_support; - bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; - - bc.bcf_hdr = bcf_hdr; - bc.n = sm->n; - bc.PL = malloc(15 * sm->n * sizeof(*bc.PL)); - if (conf->fmt_flag) - { - assert( sizeof(float)==sizeof(int32_t) ); - bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4); - bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields - if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) - { - // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample - bc.ADR = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); - bc.ADF = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); - for (i=0; in; i++) - { 
- bcr[i].ADR = bc.ADR + (i+1)*B2B_MAX_ALLELES; - bcr[i].ADF = bc.ADF + (i+1)*B2B_MAX_ALLELES; - } - } - } - } - else { - pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : samtools_stdout; - - if (pileup_fp == NULL) { - fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); - samtools_exit(EXIT_FAILURE); - } + if (pileup_fp == NULL) { + fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); + samtools_exit(EXIT_FAILURE); } // init pileup @@ -753,10 +567,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(samtools_stderr, "[%s] Combined max depth is above 1M. Potential memory hog!\n", __func__); } - // Only used when writing BCF - max_indel_depth = conf->max_indel_depth * sm->n; + bam_mplp_set_maxcnt(iter, max_depth); - bcf1_t *bcf_rec = bcf_init1(); int ret; int last_tid = -1; hts_pos_t last_pos = -1; @@ -766,256 +578,227 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested mplp_get_ref(data[0], tid, &ref, &ref_len); //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); - if (conf->flag & MPLP_BCF) { - int total_depth, _ref0, ref16; - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; - for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; - group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); - _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; - ref16 = seq_nt16_table[_ref0]; - bcf_callaux_clean(bca, &bc); - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bc.tid = tid; bc.pos = pos; - bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); - if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { - print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", - conf->output_fname?conf->output_fname:"standard output"); - samtools_exit(EXIT_FAILURE); - } - // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) - { - bcf_callaux_clean(bca, &bc); - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); - if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { - print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", - conf->output_fname?conf->output_fname:"standard output"); - samtools_exit(EXIT_FAILURE); + if (conf->all) { + // Deal with missing portions of previous tids + while (tid > last_tid) { + if (last_tid >= 0 && !conf->reg) { + while (++last_pos < sam_hdr_tid2len(h, last_tid)) { + if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); } } + last_tid++; + last_pos = -1; + if (conf->all < 2) + break; } - } else { - if (conf->all) { - // Deal with missing portions of previous tids - while (tid > last_tid) { - if (last_tid >= 0 && !conf->reg) { - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (conf->bed && bed_overlap(conf->bed, 
sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); + } + if (conf->all) { + // Deal with missing portion of current tid + while (++last_pos < pos) { + if (conf->reg && last_pos < beg0) continue; // out of range; skip + if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) + continue; + print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); + } + last_tid = tid; + last_pos = pos; + } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; + + fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + int j, cnt; + for (j = cnt = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) ++cnt; + } + fprintf(pileup_fp, "\t%d\t", cnt); + if (n_plp[i] == 0) { + fputs("*\t*", pileup_fp); + int flag_value = MPLP_PRINT_MAPQ_CHAR; + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) + fputs("\t*", pileup_fp); + flag_value <<= 1; + } + if (conf->auxlist) { + int t = 0; + while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) + fputs("\t*", pileup_fp); + } + } else { + int n = 0; + kstring_t ks = KS_INITIALIZE; + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = p->qpos < p->b->core.l_qseq + ? 
bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) { + n++; + if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, + ref, &ks, conf->rev_del, + conf->no_ins, conf->no_ins_mods, + conf->no_del, conf->no_ends) < 0) { + ret = 1; + goto fail; } } - last_tid++; - last_pos = -1; - if (conf->all < 2) - break; - } - } - if (conf->all) { - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (conf->reg && last_pos < beg0) continue; // out of range; skip - if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; - print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); } - last_tid = tid; - last_pos = pos; - } - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; + if (!n) putc('*', pileup_fp); - fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - int j, cnt; - for (j = cnt = 0; j < n_plp[i]; ++j) { + /* Print base qualities */ + n = 0; + ks_free(&ks); + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if (c >= conf->min_baseQ) ++cnt; - } - fprintf(pileup_fp, "\t%d\t", cnt); - if (n_plp[i] == 0) { - fputs("*\t*", pileup_fp); - int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_LAST) { - if (flag_value != MPLP_PRINT_MODS - && (conf->flag & flag_value)) - fputs("\t*", pileup_fp); - flag_value <<= 1; + ? bam_get_qual(p->b)[p->qpos] + : 0; + if (c >= conf->min_baseQ) { + c = c + 33 < 126? 
c + 33 : 126; + putc(c, pileup_fp); + n++; } - if (conf->auxlist) { - int t = 0; - while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) - fputs("\t*", pileup_fp); - } - } else { - int n = 0; - kstring_t ks = KS_INITIALIZE; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if (c >= conf->min_baseQ) { + } + if (!n) putc('*', pileup_fp); + + /* Print selected columns */ + int flag_value = MPLP_PRINT_MAPQ_CHAR; + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) { + n = 0; + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = &plp[i][j]; + int c = p->qpos < p->b->core.l_qseq + ? bam_get_qual(p->b)[p->qpos] + : 0; + if ( c < conf->min_baseQ ) continue; + if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp); n++; - if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, - ref, &ks, conf->rev_del, - conf->no_ins, conf->no_ins_mods, - conf->no_del, conf->no_ends) < 0) { - ret = 1; - goto fail; + + switch (flag_value) { + case MPLP_PRINT_MAPQ_CHAR: + c = p->b->core.qual + 33; + if (c > 126) c = 126; + putc(c, pileup_fp); + break; + case MPLP_PRINT_QPOS: + // query position in current orientation + fprintf(pileup_fp, "%d", p->qpos + 1); + break; + case MPLP_PRINT_QPOS5: { + // query position in 5' to 3' orientation + int pos5 = bam_is_rev(p->b) + ? 
p->b->core.l_qseq-p->qpos + p->is_del + : p->qpos + 1; + fprintf(pileup_fp, "%d", pos5); + break; + } + case MPLP_PRINT_QNAME: + fputs(bam_get_qname(p->b), pileup_fp); + break; + case MPLP_PRINT_FLAG: + fprintf(pileup_fp, "%d", p->b->core.flag); + break; + case MPLP_PRINT_RNAME: + if (p->b->core.tid >= 0) + fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); + else + putc('*', pileup_fp); + break; + case MPLP_PRINT_POS: + fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); + break; + case MPLP_PRINT_MAPQ: + fprintf(pileup_fp, "%d", p->b->core.qual); + break; + case MPLP_PRINT_RNEXT: + if (p->b->core.mtid >= 0) + fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); + else + putc('*', pileup_fp); + break; + case MPLP_PRINT_PNEXT: + fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); + break; } } + if (!n) putc('*', pileup_fp); } - if (!n) putc('*', pileup_fp); - - /* Print base qualities */ - n = 0; - ks_free(&ks); - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if (c >= conf->min_baseQ) { - c = c + 33 < 126? c + 33 : 126; - putc(c, pileup_fp); + flag_value <<= 1; + } + + /* Print selected tags */ + klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); + if (auxlist_p && auxlist_p->size) { + kliter_t(auxlist) *aux; + for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { + n = 0; + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = &plp[i][j]; + int c = p->qpos < p->b->core.l_qseq + ? 
bam_get_qual(p->b)[p->qpos] + : 0; + if ( c < conf->min_baseQ ) continue; + + if (n > 0) putc(conf->sep, pileup_fp); n++; - } - } - if (!n) putc('*', pileup_fp); - - /* Print selected columns */ - int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_LAST) { - if (flag_value != MPLP_PRINT_MODS - && (conf->flag & flag_value)) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = &plp[i][j]; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if ( c < conf->min_baseQ ) continue; - if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp); - n++; - - switch (flag_value) { - case MPLP_PRINT_MAPQ_CHAR: - c = p->b->core.qual + 33; - if (c > 126) c = 126; - putc(c, pileup_fp); - break; - case MPLP_PRINT_QPOS: - // query position in current orientation - fprintf(pileup_fp, "%d", p->qpos + 1); - break; - case MPLP_PRINT_QPOS5: { - // query position in 5' to 3' orientation - int pos5 = bam_is_rev(p->b) - ? 
p->b->core.l_qseq-p->qpos + p->is_del - : p->qpos + 1; - fprintf(pileup_fp, "%d", pos5); - break; - } - case MPLP_PRINT_QNAME: - fputs(bam_get_qname(p->b), pileup_fp); - break; - case MPLP_PRINT_FLAG: - fprintf(pileup_fp, "%d", p->b->core.flag); - break; - case MPLP_PRINT_RNAME: - if (p->b->core.tid >= 0) - fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); - else - putc('*', pileup_fp); - break; - case MPLP_PRINT_POS: - fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); - break; - case MPLP_PRINT_MAPQ: - fprintf(pileup_fp, "%d", p->b->core.qual); - break; - case MPLP_PRINT_RNEXT: - if (p->b->core.mtid >= 0) - fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); - else - putc('*', pileup_fp); - break; - case MPLP_PRINT_PNEXT: - fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); - break; - } + uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); + if (!tag_u) { + putc(conf->empty , pileup_fp); + continue; + } + + int tag_supported = 0; + + /* Tag value is string */ + if (*tag_u == 'Z' || *tag_u == 'H') { + char *tag_s = bam_aux2Z(tag_u); + if (!tag_s) continue; + fputs(tag_s, pileup_fp); + tag_supported = 1; } - if (!n) putc('*', pileup_fp); - } - flag_value <<= 1; - } - /* Print selected tags */ - klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); - if (auxlist_p && auxlist_p->size) { - kliter_t(auxlist) *aux; - for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = &plp[i][j]; - int c = p->qpos < p->b->core.l_qseq - ? 
bam_get_qual(p->b)[p->qpos] - : 0; - if ( c < conf->min_baseQ ) continue; - - if (n > 0) putc(conf->sep, pileup_fp); - n++; - uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); - if (!tag_u) { - putc(conf->empty , pileup_fp); - continue; - } - - /* Tag value is string */ - if (*tag_u == 'Z' || *tag_u == 'H') { - char *tag_s = bam_aux2Z(tag_u); - if (!tag_s) continue; - fputs(tag_s, pileup_fp); - } - - /* Tag value is integer */ - if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { - int64_t tag_i = bam_aux2i(tag_u); - fprintf(pileup_fp, "%" PRId64 "", tag_i); - } - - /* Tag value is float */ - if (*tag_u == 'd' || *tag_u == 'f') { - double tag_f = bam_aux2f(tag_u); - fprintf(pileup_fp, "%lf", tag_f); - } - - /* Tag value is character */ - if (*tag_u == 'A') { - char tag_c = bam_aux2A(tag_u); - putc(tag_c, pileup_fp); - } + /* Tag value is integer */ + if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { + int64_t tag_i = bam_aux2i(tag_u); + fprintf(pileup_fp, "%" PRId64 "", tag_i); + tag_supported = 1; } - if (!n) putc('*', pileup_fp); + + /* Tag value is float */ + if (*tag_u == 'd' || *tag_u == 'f') { + double tag_f = bam_aux2f(tag_u); + fprintf(pileup_fp, "%lf", tag_f); + tag_supported = 1; + } + + /* Tag value is character */ + if (*tag_u == 'A') { + char tag_c = bam_aux2A(tag_u); + putc(tag_c, pileup_fp); + tag_supported = 1; + } + + if (!tag_supported) putc('*', pileup_fp); } + if (!n) putc('*', pileup_fp); } } } - putc('\n', pileup_fp); } + putc('\n', pileup_fp); } if (ret < 0) { @@ -1024,7 +807,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) goto fail; } - if (conf->all && !(conf->flag & MPLP_BCF)) { + if (conf->all) { // Handle terminating region if (last_tid < 0 && conf->reg && conf->all > 1) { last_tid = tid0; @@ -1047,26 +830,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fail: // clean up - 
free(bc.tmp.s); - bcf_destroy1(bcf_rec); - if (bcf_fp) - { - release_autoflush(bcf_fp); - hts_close(bcf_fp); - bcf_hdr_destroy(bcf_hdr); - bcf_call_destroy(bca); - free(bc.PL); - free(bc.DP4); - free(bc.ADR); - free(bc.ADF); - free(bc.fmt_arr); - free(bcr); - } if (pileup_fp && conf->output_fname) fclose(pileup_fp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); - bcf_call_del_rghash(rghash); bam_mplp_destroy(iter); sam_hdr_destroy(h); for (i = 0; i < n; ++i) { @@ -1145,35 +912,6 @@ int read_file_list(const char *file_list,int *n,char **argv[]) } #undef MAX_PATH_LEN -int parse_format_flag(const char *str) -{ - int i, flag = 0, n_tags; - char **tags = hts_readlist(str, 0, &n_tags); - for(i=0; irflag_require); @@ -1237,18 +975,13 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) sam_global_opt_help(fp, "-.--.--."); fprintf(fp, "\n" -"Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n" -"deprecated. To output these formats, please use \"bcftools mpileup\" instead.\n"); +"Note that using \"samtools mpileup\" to generate BCF or VCF files has been\n" +"removed. To output these formats, please use \"bcftools mpileup\" instead.\n"); free(tmp_require); free(tmp_filter); } -static void deprecated(char opt) { - fprintf(samtools_stderr, "[warning] samtools mpileup option `%c` is functional, " - "but deprecated. 
Please switch to using bcftools mpileup in future.\n", opt); -} - int bam_mpileup(int argc, char *argv[]) { int c; @@ -1260,9 +993,6 @@ int bam_mpileup(int argc, char *argv[]) mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = MPLP_MAX_DEPTH; - mplp.max_indel_depth = MPLP_MAX_INDEL_DEPTH; - mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; - mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; @@ -1281,7 +1011,6 @@ int bam_mpileup(int argc, char *argv[]) {"incl-flags", required_argument, NULL, 1}, {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, - {"open-prob", required_argument, NULL, 4}, {"output-QNAME", no_argument, NULL, 5}, {"output-qname", no_argument, NULL, 5}, {"illumina1.3+", no_argument, NULL, '6'}, @@ -1306,10 +1035,6 @@ int bam_mpileup(int argc, char *argv[]) {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, {"ignore-overlaps", no_argument, NULL, 'x'}, - {"BCF", no_argument, NULL, 'g'}, - {"bcf", no_argument, NULL, 'g'}, - {"VCF", no_argument, NULL, 'v'}, - {"vcf", no_argument, NULL, 'v'}, {"output-mods", no_argument, NULL, 'M'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, @@ -1317,8 +1042,6 @@ int bam_mpileup(int argc, char *argv[]) {"output-bp-5", no_argument, NULL, 14}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, - {"output-tags", required_argument, NULL, 't'}, - {"uncompressed", no_argument, NULL, 'u'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"tandem-qual", required_argument, NULL, 'h'}, @@ -1340,7 +1063,7 @@ int bam_mpileup(int argc, char *argv[]) {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:aM",lopts,NULL)) >= 0) { + while ((c 
= getopt_long(argc, argv, "Af:r:l:q:Q:RC:Bd:b:o:EG:6OsxXaM",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1352,7 +1075,6 @@ int bam_mpileup(int argc, char *argv[]) if ( mplp.rflag_filter<0 ) { fprintf(samtools_stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; - case 4 : mplp.openQ = atoi(optarg); break; case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; case 6 : mplp.rev_del = 1; break; case 7 : @@ -1381,54 +1103,21 @@ int bam_mpileup(int argc, char *argv[]) mplp.bed = bed_read(optarg); if (!mplp.bed) { print_error_errno("mpileup", "Could not read file \"%s\"", optarg); return 1; } break; - case 'P': mplp.pl_list = strdup(optarg); deprecated(c); break; - case 'p': mplp.flag |= MPLP_PER_SAMPLE; deprecated(c); break; - case 'g': mplp.flag |= MPLP_BCF; deprecated(c); break; - case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break; - case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'X': has_index_file = 1; break; - case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break; - case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break; - case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break; - case 'I': mplp.flag |= MPLP_NO_INDEL; deprecated(c); break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break; - case 'O': - if (!(mplp.flag & MPLP_PRINT_QPOS5)) - mplp.flag |= MPLP_PRINT_QPOS; - break; - case 14: - mplp.flag |= MPLP_PRINT_QPOS5; - mplp.flag &= ~MPLP_PRINT_QPOS; - break; + case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; + case 14: mplp.flag |= MPLP_PRINT_QPOS5; break; case 'M': mplp.flag |= MPLP_PRINT_MODS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = 
atoi(optarg); break; case 'b': file_list = optarg; break; - case 'o': { - char *end; - long value = strtol(optarg, &end, 10); - // Distinguish between -o INT and -o FILE (a bit of a hack!) - if (*end == '\0') { - mplp.openQ = value; - fprintf(samtools_stderr, "[warning] samtools mpileup option " - "'--open-prob INT' is functional, but deprecated. " - "Please switch to using bcftools mpileup in future.\n"); - } else { - mplp.output_fname = optarg; - } - } - break; - case 'e': mplp.extQ = atoi(optarg); deprecated(c); break; - case 'h': mplp.tandemQ = atoi(optarg); deprecated(c); break; + case 'o': mplp.output_fname = optarg; break; case 'A': use_orphan = 1; break; - case 'F': mplp.min_frac = atof(optarg); deprecated(c); break; - case 'm': mplp.min_support = atoi(optarg); deprecated(c); break; - case 'L': mplp.max_indel_depth = atoi(optarg); deprecated(c); break; case 'G': { FILE *fp_rg; char buf[1024]; @@ -1440,7 +1129,6 @@ int bam_mpileup(int argc, char *argv[]) fclose(fp_rg); } break; - case 't': mplp.fmt_flag |= parse_format_flag(optarg); deprecated(c); break; case 'a': mplp.all++; break; default: if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break; diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index e36e6e7..22a6cd9 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -533,7 +533,7 @@ cleanup: return h; } -int samtools_main_reheader(int argc, char *argv[]) +int main_reheader(int argc, char *argv[]) { int inplace = 0, r, no_pg = 0, c, skip_header = 0; sam_hdr_t *h; diff --git a/samtools/bamtk.c b/samtools/bamtk.c index fedfe69..ffec347 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2021 Genome Research Ltd. + Copyright (C) 2008-2022 Genome Research Ltd. 
Author: Heng Li @@ -47,6 +47,7 @@ int bam_fillmd(int argc, char *argv[]); int bam_idxstats(int argc, char *argv[]); int bam_markdup(int argc, char *argv[]); int main_samview(int argc, char *argv[]); +int main_head(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); @@ -69,6 +70,7 @@ int amplicon_clip_main(int argc, char *argv[]); int main_ampliconstats(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_samples(int argc, char *argv[]); +int main_consensus(int argc, char *argv[]); const char *samtools_version() { @@ -98,7 +100,7 @@ const char *samtools_feature_string(void) { static void long_version(void) { printf("samtools %s\n" "Using htslib %s\n" - "Copyright (C) 2021 Genome Research Ltd.\n", + "Copyright (C) 2022 Genome Research Ltd.\n", samtools_version(), hts_version()); printf("\nSamtools compilation details:\n"); @@ -168,6 +170,7 @@ static void usage(FILE *fp) " -- File operations\n" " collate shuffle and group alignments by name\n" " cat concatenate BAMs\n" +" consensus produce a consensus Pileup/FASTA/FASTQ\n" " merge merge sorted alignments\n" " mpileup multi-way pileup\n" " sort sort alignment file\n" @@ -189,6 +192,7 @@ static void usage(FILE *fp) "\n" " -- Viewing\n" " flags explain BAM flags\n" +" head header viewer\n" " tview text alignment viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" @@ -242,6 +246,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1); else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); + else if (strcmp(argv[1], "head") == 0) ret = main_head(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); else if 
(strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); @@ -278,10 +283,10 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); + else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ - strcmp(argv[1], "--version") == 0) { + strcmp(argv[1], "--version") == 0) long_version(); - } else if (strcmp(argv[1], "--version-only") == 0) { printf("%s+htslib-%s\n", samtools_version(), hts_version()); } diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index 3257ba1..11d21a3 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -2,7 +2,7 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2021 Genome Research Ltd. + Copyright (C) 2008-2022 Genome Research Ltd. 
Author: Heng Li @@ -50,7 +50,8 @@ int bam_fillmd(int argc, char *argv[]); int bam_idxstats(int argc, char *argv[]); int bam_markdup(int argc, char *argv[]); int main_samview(int argc, char *argv[]); -int samtools_main_reheader(int argc, char *argv[]); +int main_head(int argc, char *argv[]); +int main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); int main_cat(int argc, char *argv[]); @@ -72,6 +73,7 @@ int amplicon_clip_main(int argc, char *argv[]); int main_ampliconstats(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_samples(int argc, char *argv[]); +int main_consensus(int argc, char *argv[]); const char *samtools_version() { @@ -101,7 +103,7 @@ const char *samtools_feature_string(void) { static void long_version(void) { fprintf(samtools_stdout, "samtools %s\n" "Using htslib %s\n" - "Copyright (C) 2021 Genome Research Ltd.\n", + "Copyright (C) 2022 Genome Research Ltd.\n", samtools_version(), hts_version()); fprintf(samtools_stdout, "\nSamtools compilation details:\n"); @@ -171,6 +173,7 @@ static void usage(FILE *fp) " -- File operations\n" " collate shuffle and group alignments by name\n" " cat concatenate BAMs\n" +" consensus produce a consensus Pileup/FASTA/FASTQ\n" " merge merge sorted alignments\n" " mpileup multi-way pileup\n" " sort sort alignment file\n" @@ -192,6 +195,7 @@ static void usage(FILE *fp) "\n" " -- Viewing\n" " flags explain BAM flags\n" +" head header viewer\n" " tview text alignment viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" @@ -245,6 +249,7 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1); else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); + else if (strcmp(argv[1], "head") == 0) ret = main_head(argc-1, argv+1); else if 
(strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); @@ -253,7 +258,7 @@ int samtools_main(int argc, char *argv[]) strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "reheader") == 0) ret = samtools_main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1); else if (strcmp(argv[1], "cat") == 0) ret = main_cat(argc-1, argv+1); else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1); else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1); @@ -281,10 +286,10 @@ int samtools_main(int argc, char *argv[]) //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); + else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ - strcmp(argv[1], "--version") == 0) { + strcmp(argv[1], "--version") == 0) long_version(); - } else if (strcmp(argv[1], "--version-only") == 0) { fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version()); } diff --git a/samtools/bedcov.c b/samtools/bedcov.c index bccc09b..07bd9ce 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -81,6 +81,8 @@ int main_bedcov(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { + {"min-MQ", required_argument, NULL, 'Q'}, + {"min-mq", required_argument, NULL, 'Q'}, SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), { NULL, 0, NULL, 0 } 
}; @@ -116,7 +118,7 @@ int main_bedcov(int argc, char *argv[]) if (usage || optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov [options] [...]\n\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -Q mapping quality threshold [0]\n"); + fprintf(stderr, " -Q, --min-MQ mapping quality threshold [0]\n"); fprintf(stderr, " -X use customized index files\n"); fprintf(stderr, " -g remove the specified flags from the set used to filter out reads\n"); fprintf(stderr, " -G add the specified flags to the set used to filter out reads\n" diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index b72cbf1..162630f 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -83,6 +83,8 @@ int main_bedcov(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { + {"min-MQ", required_argument, NULL, 'Q'}, + {"min-mq", required_argument, NULL, 'Q'}, SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), { NULL, 0, NULL, 0 } }; @@ -118,7 +120,7 @@ int main_bedcov(int argc, char *argv[]) if (usage || optind + 2 > argc) { fprintf(samtools_stderr, "Usage: samtools bedcov [options] [...]\n\n"); fprintf(samtools_stderr, "Options:\n"); - fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); + fprintf(samtools_stderr, " -Q, --min-MQ mapping quality threshold [0]\n"); fprintf(samtools_stderr, " -X use customized index files\n"); fprintf(samtools_stderr, " -g remove the specified flags from the set used to filter out reads\n"); fprintf(samtools_stderr, " -G add the specified flags to the set used to filter out reads\n" diff --git a/samtools/consensus_pileup.c b/samtools/consensus_pileup.c new file mode 100644 index 0000000..935cbdc --- /dev/null +++ b/samtools/consensus_pileup.c @@ -0,0 +1,595 @@ +/* consensus__pileup.h -- Pileup orientated data per consensus column + + Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include + +#ifdef __SSE__ +# include +#else +# define _mm_prefetch(a,b) +#endif + +#include "consensus_pileup.h" + +#define MIN(a,b) ((a)<(b)?(a):(b)) +#define bam_strand(b) (((b)->core.flag & BAM_FREVERSE) != 0) + +/* + * START_WITH_DEL is the mode that Gap5 uses when building this. It prepends + * all cigar strings with 1D and decrements the position by one. (And then + * has code to reverse this operation in the pileup handler.) + * + * The reason for this is that it means reads starting with an insertion work. + * Otherwise the inserted bases are silently lost. (Try it with "samtools + * mpileup" and you can see it has the same issue.) + * + * However it's probably not what most people expect. + */ +//#define START_WITH_DEL + +/* -------------------------------------------------------------------------- + * The pileup code itself. 
+ * + * This consists of the external pileup_loop() function, which takes a + * sam/bam samfile_t pointer and a callback function. The callback function + * is called once per column of aligned data (so once per base in an + * insertion). + * + * Current known issues. + * 1) zero length matches, ie 2S2S cause failures. + * 2) Insertions at starts of sequences get included in the soft clip, so + * 2S2I2M is treated as if it's 4S2M + * 3) From 1 and 2 above, 1S1I2S becomes 2S2S which fails. + */ + + +/* + * Fetches the next base => the nth base at unpadded position pos. (Nth can + * be greater than 0 if we have an insertion in this column). Do not call this + * with pos/nth lower than the previous query, although higher is better. + * (This allows it to be initialised at base 0.) + * + * Stores the result in base and also updates is_insert to indicate that + * this sequence still has more bases in this position beyond the current + * nth parameter. + * + * Returns 1 if a base was fetched + * 0 if not (eg ran off the end of sequence) + */ +static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { + bam1_t *b = &p->b; + int op = p->cigar_op; + + p->start -= p->start>0; + if (p->first_del && op != BAM_CPAD) + p->first_del = 0; + + *is_insert = 0; + + /* Find pos first */ + while (p->pos < pos) { + p->nth = 0; + + if (p->cigar_len == 0) { + if (p->cigar_ind >= b->core.n_cigar) { + p->eof = 1; + return 0; + } + + op=p->cigar_op = p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK; + p->cigar_len = p->b_cigar[p->cigar_ind] >> BAM_CIGAR_SHIFT; + p->cigar_ind++; + } + + if ((op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) + && p->cigar_len <= pos - p->pos) { + p->seq_offset += p->cigar_len; + p->pos += p->cigar_len; + p->cigar_len = 0; + } else { + switch (op) { + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + p->seq_offset++; + /* Fall through */ + case BAM_CDEL: + case BAM_CREF_SKIP: + p->pos++; + p->cigar_len--; + break; + + case BAM_CINS: + 
case BAM_CSOFT_CLIP: + p->seq_offset += p->cigar_len; + /* Fall through */ + case BAM_CPAD: + case BAM_CHARD_CLIP: + p->cigar_len = 0; + break; + + default: + fprintf(stderr, "Unhandled cigar_op %d\n", op); + return -1; + } + } + } + + /* Now at pos, find nth base */ + while (p->nth < nth) { + if (p->cigar_len == 0) { + if (p->cigar_ind >= b->core.n_cigar) { + p->eof = 1; + return 0; /* off end of seq */ + } + + op=p->cigar_op = p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK; + p->cigar_len = p->b_cigar[p->cigar_ind] >> BAM_CIGAR_SHIFT; + p->cigar_ind++; + } + + switch (op) { + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + case BAM_CSOFT_CLIP: + case BAM_CDEL: + case BAM_CREF_SKIP: + goto at_nth; /* sorry, but it's fast! */ + + case BAM_CINS: + p->seq_offset++; + /* Fall through */ + case BAM_CPAD: + p->cigar_len--; + p->nth++; + break; + + case BAM_CHARD_CLIP: + p->cigar_len = 0; + break; + + default: + fprintf(stderr, "Unhandled cigar_op %d\n", op); + return -1; + } + } + at_nth: + + /* Fill out base & qual fields */ + p->ref_skip = 0; + if (p->nth < nth && op != BAM_CINS) { + //p->base = '-'; + p->base = '*'; + p->base4 = 16; + p->padding = 1; + if (p->seq_offset < b->core.l_qseq) + p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + else + p->qual = 0; + } else { + p->padding = 0; + switch(op) { + case BAM_CDEL: + p->base = '*'; + p->base4 = 16; + if (p->seq_offset+1 < b->core.l_qseq) + p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + else + p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + break; + + case BAM_CPAD: + //p->base = '+'; + p->base = '*'; + p->base4 = 16; + if (p->seq_offset+1 < b->core.l_qseq) + p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + else + p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + break; + + case BAM_CREF_SKIP: + p->base = '.'; + p->base4 = 0; + p->qual = 0; + /* end of fragment, but not sequence */ + p->eof = p->eof ? 
2 : 3; + p->ref_skip = 1; + break; + + default: + if (p->seq_offset < b->core.l_qseq) { + p->qual = p->b_qual[p->seq_offset]; + p->base4 = p->b_seq[p->seq_offset/2] >> + ((~p->seq_offset&1)<<2) & 0xf; + p->base = "NACMGRSVTWYHKDBN"[p->base4]; + } else { + p->base = 'N'; + p->base4 = 15; + p->qual = 0xff; + } + + break; + } + } + + /* Handle moving out of N (skip) into sequence again */ + if (p->eof && p->base != '.') { + p->start = 1; + p->ref_skip = 1; + p->eof = 0; + } + + /* Starting with an indel needs a minor fudge */ + if (p->start && p->cigar_op == BAM_CDEL) { + p->first_del = 1; + } + + /* Check if next op is an insertion of some sort */ + if (p->cigar_len == 0) { + if (p->cigar_ind < b->core.n_cigar) { + op=p->cigar_op = p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK; + p->cigar_len = p->b_cigar[p->cigar_ind] >> BAM_CIGAR_SHIFT; + p->cigar_ind++; + if (op == BAM_CREF_SKIP) { + p->eof = 3; + p->ref_skip = 1; + } + } else { + p->eof = 1; + } + } + + switch (op) { + case BAM_CPAD: + case BAM_CINS: + *is_insert = p->cigar_len; + break; + + case BAM_CSOFT_CLIP: + /* Last op 'S' => eof */ + p->eof = (p->cigar_ind == b->core.n_cigar || + (p->cigar_ind+1 == b->core.n_cigar && + (p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK) + == BAM_CHARD_CLIP)) + ? 1 + : 0; + break; + + case BAM_CHARD_CLIP: + p->eof = 1; + break; + + default: + break; + } + + return 1; +} + +/* + * Loops through a set of supplied ranges producing columns of data. + * When found, it calls func with clientdata as a callback. Func should + * return 0 for success and non-zero for failure. seq_init() is called + * on each new entry before we start processing it. It should return 0 or 1 + * to indicate reject or accept status (eg to filter unmapped data). + * If seq_init() returns -1 we abort the pileup_loop with an error. + * seq_init may be NULL. 
+ * + * Returns 0 on success + * -1 on failure + */ +int pileup_loop(samFile *fp, + sam_hdr_t *h, + int (*seq_fetch)(void *client_data, + samFile *fp, + sam_hdr_t *h, + bam1_t *b), + int (*seq_init)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p), + int (*seq_add)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert), + void *client_data) { + int ret = -1; + pileup_t *phead = NULL, *p, *pfree = NULL, *last, *next, *ptail = NULL; + pileup_t *pnew = NULL; + int is_insert, nth = 0, r; + hts_pos_t col = 0; + int last_ref = -1; + + /* FIXME: allow for start/stop boundaries rather than consuming all data */ + + if (NULL == (pnew = calloc(1, sizeof(*p)))) + return -1; + + do { + bam1_t *b; + hts_pos_t pos; + + r = seq_fetch(client_data, fp, h, &pnew->b); + if (r < -1) { + fprintf(stderr, "bam_next_seq() failure.\n"); + goto error; + } + + b = &pnew->b; + + /* Force realloc */ + //fp->bs = NULL; + //fp->bs_size = 0; + + //r = samread(fp, pnew->b); + if (r >= 0) { + if (b->core.flag & BAM_FUNMAP) + continue; + + if (b->core.tid == -1) { + /* Another indicator for unmapped */ + continue; + } else if (b->core.tid == last_ref) { + pos = b->core.pos+1; + //printf("New seq at pos %d @ %d %s\n", pos, b->core.tid, + // bam_name(b)); + } else { + //printf("New ctg at pos %ld @ %d\n",b->core.pos+1,b->core.tid); + pos = HTS_POS_MAX; + } + } else { + pos = HTS_POS_MAX; + } + + if (col > pos) { + fprintf(stderr, "BAM/SAM file is not sorted by position. " + "Aborting\n"); + goto error; + } + + /* Process data between the last column and our latest addition */ + while (col < pos && phead) { + struct pileup *eof_head = NULL, *eofp = NULL; + int v, ins, depth = 0; + //printf("Col=%ld pos=%ld nth=%d\n", col, pos, nth); + + /* Pileup */ + is_insert = 0; + pileup_t *pnext = phead ? 
phead->next : NULL; + for (p = phead, last = NULL; p; p = pnext) { +#if 0 + // Simple prefetching + pnext = p->next; + if (pnext) + _mm_prefetch(pnext, _MM_HINT_T0); +#else + // More complex prefetching => more instructions, but + // usually faster. + pnext = p->next; + if (pnext) { + // start memory fetches; a big help on very deep data + if (pnext->next) + // struct 2 ahead + _mm_prefetch(pnext->next, _MM_HINT_T0); + // seq/qual 1 ahead + _mm_prefetch(pnext->b_qual + pnext->seq_offset, + _MM_HINT_T0); + _mm_prefetch(pnext->b_seq + pnext->seq_offset/2, + _MM_HINT_T0); + } +#endif + + if (!get_next_base(p, col, nth, &ins)) + p->eof = 1; + if (p->eof == 1) { + if (eofp) + eofp->eofn = p; + eofp = p; + eofp->eofl = last; + if (!eof_head) + eof_head = eofp; + } else { + last = p; + } + + if (is_insert < ins) + is_insert = ins; + + depth++; + } + if ((ptail = last) == NULL) + ptail = phead; + + /* Call our function on phead linked list */ + v = seq_add(client_data, fp, h, phead, depth, +#ifdef START_WITH_DEL + col-1, +#else + col, +#endif + nth, is_insert); + + /* Remove dead seqs */ + for (p = eof_head ; p; p = p->eofn) { + if (p->eofl) + p->eofl->next = p->next; + else + phead = p->next; + + p->next = pfree; + pfree = p; + } + + if (v == 1) + break; /* early abort */ + + if (v != 0) + goto error; + + /* Next column */ + if (is_insert) { + nth++; + } else { + nth = 0; + col++; + } + } + + /* May happen if we have a hole in the contig */ + col = pos; + + /* New contig */ + if (b && b->core.tid != last_ref) { + last_ref = b->core.tid; + pos = b->core.pos+1; + nth = 0; + col = pos; + } + + /* + * Add this seq. + * Note: cigars starting with I or P ops (eg 2P3I10M) mean we have + * alignment instructions that take place before the designated + * starting location listed in the SAM file. They won't get included + * in the callback function until they officially start, which is + * already too late. 
+ * + * So to workaround this, we prefix all CIGAR with 1D, move the + * position by 1bp, and then force the callback code to remove + * leaving pads (either P or D generated). + * + * Ie it's a level 10 hack! + */ + if (r >= 0) { + p = pnew; + p->next = NULL; + p->cd = NULL; + p->eofn = NULL; + p->eofl = NULL; + p->start = 2; + p->eof = 0; +#ifdef START_WITH_DEL + p->pos = pos-1; + p->cigar_ind = 0; + p->b_cigar = bam_get_cigar(&p->b); + if ((p->b_cigar[0] & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + p->cigar_len = p->b_cigar[0] >> BAM_CIGAR_SHIFT; + p->cigar_op = BAM_CHARD_CLIP; + if ((p->b_cigar[1] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP) { + /* xHxS... => xHxS1D... */ + p->b_cigar[0] = p->b_cigar[1]; + p->b_cigar[1] = (1 << BAM_CIGAR_SHIFT) | BAM_CDEL; + } else { + /* xH... => xH1D... */ + p->b_cigar[0] = (1 << BAM_CIGAR_SHIFT) | BAM_CDEL; + } + } else { + if ((p->b_cigar[0] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP) { + /* xS... => xS1D... */ + p->cigar_len = p->b_cigar[0] >> BAM_CIGAR_SHIFT; + p->cigar_op = BAM_CSOFT_CLIP; + p->b_cigar[0] = (1 << BAM_CIGAR_SHIFT) | BAM_CDEL; + } else { + /* ... => 1D... 
*/ + p->cigar_len = 1; /* was 0 */ + p->cigar_op = BAM_CDEL; /* was 'X' */ + } + } + p->seq_offset = -1; + p->first_del = 1; +#else + p->pos = pos-1; + p->cigar_ind = 0; + p->b_cigar = bam_get_cigar(&p->b); + p->cigar_len = 0; + p->cigar_op = -1; + p->seq_offset = -1; + p->first_del = 0; +#endif + p->b_is_rev = bam_is_rev(&p->b); + p->b_qual = (uint8_t *)bam_get_qual(&p->b); + p->b_seq = (uint8_t *)bam_get_seq(&p->b); + + if (seq_init) { + int v; + v = seq_init(client_data, fp, h, p); + if (v == -1) + goto error; + + if (v == 1) { + /* Keep this seq */ + if (phead) { + ptail->next = p; + } else { + phead = p; + } + ptail = p; + } else { + /* Push back on free list */ + p->next = pfree; + pfree = p; + } + } else { + if (phead) + ptail->next = p; + else + phead = p; + ptail = p; + } + + /* Allocate the next pileup rec */ + if (pfree) { + pnew = pfree; + pfree = pfree->next; + } else { + if (NULL == (pnew = calloc(1, sizeof(*pnew)))) + goto error; + } + } + } while (r >= 0); + + ret = 0; + error: + + if (pnew) { + free(pnew->b.data); + free(pnew); + } + + /* Tidy up */ + for (p = pfree; p; p = next) { + next = p->next; + free(p->b.data); + free(p); + } + + return ret; +} diff --git a/samtools/consensus_pileup.c.pysam.c b/samtools/consensus_pileup.c.pysam.c new file mode 100644 index 0000000..dde0ad0 --- /dev/null +++ b/samtools/consensus_pileup.c.pysam.c @@ -0,0 +1,597 @@ +#include "samtools.pysam.h" + +/* consensus__pileup.h -- Pileup orientated data per consensus column + + Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include + +#ifdef __SSE__ +# include +#else +# define _mm_prefetch(a,b) +#endif + +#include "consensus_pileup.h" + +#define MIN(a,b) ((a)<(b)?(a):(b)) +#define bam_strand(b) (((b)->core.flag & BAM_FREVERSE) != 0) + +/* + * START_WITH_DEL is the mode that Gap5 uses when building this. It prepends + * all cigar strings with 1D and decrements the position by one. (And then + * has code to reverse this operation in the pileup handler.) + * + * The reason for this is that it means reads starting with an insertion work. + * Otherwise the inserted bases are silently lost. (Try it with "samtools + * mpileup" and you can see it has the same issue.) + * + * However it's probably not what most people expect. + */ +//#define START_WITH_DEL + +/* -------------------------------------------------------------------------- + * The pileup code itself. 
+ * + * This consists of the external pileup_loop() function, which takes a + * sam/bam samfile_t pointer and a callback function. The callback function + * is called once per column of aligned data (so once per base in an + * insertion). + * + * Current known issues. + * 1) zero length matches, ie 2S2S cause failures. + * 2) Insertions at starts of sequences get included in the soft clip, so + * 2S2I2M is treated as if it's 4S2M + * 3) From 1 and 2 above, 1S1I2S becomes 2S2S which fails. + */ + + +/* + * Fetches the next base => the nth base at unpadded position pos. (Nth can + * be greater than 0 if we have an insertion in this column). Do not call this + * with pos/nth lower than the previous query, although higher is better. + * (This allows it to be initialised at base 0.) + * + * Stores the result in base and also updates is_insert to indicate that + * this sequence still has more bases in this position beyond the current + * nth parameter. + * + * Returns 1 if a base was fetched + * 0 if not (eg ran off the end of sequence) + */ +static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { + bam1_t *b = &p->b; + int op = p->cigar_op; + + p->start -= p->start>0; + if (p->first_del && op != BAM_CPAD) + p->first_del = 0; + + *is_insert = 0; + + /* Find pos first */ + while (p->pos < pos) { + p->nth = 0; + + if (p->cigar_len == 0) { + if (p->cigar_ind >= b->core.n_cigar) { + p->eof = 1; + return 0; + } + + op=p->cigar_op = p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK; + p->cigar_len = p->b_cigar[p->cigar_ind] >> BAM_CIGAR_SHIFT; + p->cigar_ind++; + } + + if ((op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) + && p->cigar_len <= pos - p->pos) { + p->seq_offset += p->cigar_len; + p->pos += p->cigar_len; + p->cigar_len = 0; + } else { + switch (op) { + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + p->seq_offset++; + /* Fall through */ + case BAM_CDEL: + case BAM_CREF_SKIP: + p->pos++; + p->cigar_len--; + break; + + case BAM_CINS: + 
case BAM_CSOFT_CLIP: + p->seq_offset += p->cigar_len; + /* Fall through */ + case BAM_CPAD: + case BAM_CHARD_CLIP: + p->cigar_len = 0; + break; + + default: + fprintf(samtools_stderr, "Unhandled cigar_op %d\n", op); + return -1; + } + } + } + + /* Now at pos, find nth base */ + while (p->nth < nth) { + if (p->cigar_len == 0) { + if (p->cigar_ind >= b->core.n_cigar) { + p->eof = 1; + return 0; /* off end of seq */ + } + + op=p->cigar_op = p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK; + p->cigar_len = p->b_cigar[p->cigar_ind] >> BAM_CIGAR_SHIFT; + p->cigar_ind++; + } + + switch (op) { + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + case BAM_CSOFT_CLIP: + case BAM_CDEL: + case BAM_CREF_SKIP: + goto at_nth; /* sorry, but it's fast! */ + + case BAM_CINS: + p->seq_offset++; + /* Fall through */ + case BAM_CPAD: + p->cigar_len--; + p->nth++; + break; + + case BAM_CHARD_CLIP: + p->cigar_len = 0; + break; + + default: + fprintf(samtools_stderr, "Unhandled cigar_op %d\n", op); + return -1; + } + } + at_nth: + + /* Fill out base & qual fields */ + p->ref_skip = 0; + if (p->nth < nth && op != BAM_CINS) { + //p->base = '-'; + p->base = '*'; + p->base4 = 16; + p->padding = 1; + if (p->seq_offset < b->core.l_qseq) + p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + else + p->qual = 0; + } else { + p->padding = 0; + switch(op) { + case BAM_CDEL: + p->base = '*'; + p->base4 = 16; + if (p->seq_offset+1 < b->core.l_qseq) + p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + else + p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + break; + + case BAM_CPAD: + //p->base = '+'; + p->base = '*'; + p->base4 = 16; + if (p->seq_offset+1 < b->core.l_qseq) + p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + else + p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + break; + + case BAM_CREF_SKIP: + p->base = '.'; + p->base4 = 0; + p->qual = 0; + /* end of fragment, but not sequence */ + p->eof = p->eof ? 
2 : 3; + p->ref_skip = 1; + break; + + default: + if (p->seq_offset < b->core.l_qseq) { + p->qual = p->b_qual[p->seq_offset]; + p->base4 = p->b_seq[p->seq_offset/2] >> + ((~p->seq_offset&1)<<2) & 0xf; + p->base = "NACMGRSVTWYHKDBN"[p->base4]; + } else { + p->base = 'N'; + p->base4 = 15; + p->qual = 0xff; + } + + break; + } + } + + /* Handle moving out of N (skip) into sequence again */ + if (p->eof && p->base != '.') { + p->start = 1; + p->ref_skip = 1; + p->eof = 0; + } + + /* Starting with an indel needs a minor fudge */ + if (p->start && p->cigar_op == BAM_CDEL) { + p->first_del = 1; + } + + /* Check if next op is an insertion of some sort */ + if (p->cigar_len == 0) { + if (p->cigar_ind < b->core.n_cigar) { + op=p->cigar_op = p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK; + p->cigar_len = p->b_cigar[p->cigar_ind] >> BAM_CIGAR_SHIFT; + p->cigar_ind++; + if (op == BAM_CREF_SKIP) { + p->eof = 3; + p->ref_skip = 1; + } + } else { + p->eof = 1; + } + } + + switch (op) { + case BAM_CPAD: + case BAM_CINS: + *is_insert = p->cigar_len; + break; + + case BAM_CSOFT_CLIP: + /* Last op 'S' => eof */ + p->eof = (p->cigar_ind == b->core.n_cigar || + (p->cigar_ind+1 == b->core.n_cigar && + (p->b_cigar[p->cigar_ind] & BAM_CIGAR_MASK) + == BAM_CHARD_CLIP)) + ? 1 + : 0; + break; + + case BAM_CHARD_CLIP: + p->eof = 1; + break; + + default: + break; + } + + return 1; +} + +/* + * Loops through a set of supplied ranges producing columns of data. + * When found, it calls func with clientdata as a callback. Func should + * return 0 for success and non-zero for failure. seq_init() is called + * on each new entry before we start processing it. It should return 0 or 1 + * to indicate reject or accept status (eg to filter unmapped data). + * If seq_init() returns -1 we abort the pileup_loop with an error. + * seq_init may be NULL. 
+ * + * Returns 0 on success + * -1 on failure + */ +int pileup_loop(samFile *fp, + sam_hdr_t *h, + int (*seq_fetch)(void *client_data, + samFile *fp, + sam_hdr_t *h, + bam1_t *b), + int (*seq_init)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p), + int (*seq_add)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert), + void *client_data) { + int ret = -1; + pileup_t *phead = NULL, *p, *pfree = NULL, *last, *next, *ptail = NULL; + pileup_t *pnew = NULL; + int is_insert, nth = 0, r; + hts_pos_t col = 0; + int last_ref = -1; + + /* FIXME: allow for start/stop boundaries rather than consuming all data */ + + if (NULL == (pnew = calloc(1, sizeof(*p)))) + return -1; + + do { + bam1_t *b; + hts_pos_t pos; + + r = seq_fetch(client_data, fp, h, &pnew->b); + if (r < -1) { + fprintf(samtools_stderr, "bam_next_seq() failure.\n"); + goto error; + } + + b = &pnew->b; + + /* Force realloc */ + //fp->bs = NULL; + //fp->bs_size = 0; + + //r = samread(fp, pnew->b); + if (r >= 0) { + if (b->core.flag & BAM_FUNMAP) + continue; + + if (b->core.tid == -1) { + /* Another indicator for unmapped */ + continue; + } else if (b->core.tid == last_ref) { + pos = b->core.pos+1; + //printf("New seq at pos %d @ %d %s\n", pos, b->core.tid, + // bam_name(b)); + } else { + //printf("New ctg at pos %ld @ %d\n",b->core.pos+1,b->core.tid); + pos = HTS_POS_MAX; + } + } else { + pos = HTS_POS_MAX; + } + + if (col > pos) { + fprintf(samtools_stderr, "BAM/SAM file is not sorted by position. " + "Aborting\n"); + goto error; + } + + /* Process data between the last column and our latest addition */ + while (col < pos && phead) { + struct pileup *eof_head = NULL, *eofp = NULL; + int v, ins, depth = 0; + //printf("Col=%ld pos=%ld nth=%d\n", col, pos, nth); + + /* Pileup */ + is_insert = 0; + pileup_t *pnext = phead ? 
phead->next : NULL; + for (p = phead, last = NULL; p; p = pnext) { +#if 0 + // Simple prefetching + pnext = p->next; + if (pnext) + _mm_prefetch(pnext, _MM_HINT_T0); +#else + // More complex prefetching => more instructions, but + // usually faster. + pnext = p->next; + if (pnext) { + // start memory fetches; a big help on very deep data + if (pnext->next) + // struct 2 ahead + _mm_prefetch(pnext->next, _MM_HINT_T0); + // seq/qual 1 ahead + _mm_prefetch(pnext->b_qual + pnext->seq_offset, + _MM_HINT_T0); + _mm_prefetch(pnext->b_seq + pnext->seq_offset/2, + _MM_HINT_T0); + } +#endif + + if (!get_next_base(p, col, nth, &ins)) + p->eof = 1; + if (p->eof == 1) { + if (eofp) + eofp->eofn = p; + eofp = p; + eofp->eofl = last; + if (!eof_head) + eof_head = eofp; + } else { + last = p; + } + + if (is_insert < ins) + is_insert = ins; + + depth++; + } + if ((ptail = last) == NULL) + ptail = phead; + + /* Call our function on phead linked list */ + v = seq_add(client_data, fp, h, phead, depth, +#ifdef START_WITH_DEL + col-1, +#else + col, +#endif + nth, is_insert); + + /* Remove dead seqs */ + for (p = eof_head ; p; p = p->eofn) { + if (p->eofl) + p->eofl->next = p->next; + else + phead = p->next; + + p->next = pfree; + pfree = p; + } + + if (v == 1) + break; /* early abort */ + + if (v != 0) + goto error; + + /* Next column */ + if (is_insert) { + nth++; + } else { + nth = 0; + col++; + } + } + + /* May happen if we have a hole in the contig */ + col = pos; + + /* New contig */ + if (b && b->core.tid != last_ref) { + last_ref = b->core.tid; + pos = b->core.pos+1; + nth = 0; + col = pos; + } + + /* + * Add this seq. + * Note: cigars starting with I or P ops (eg 2P3I10M) mean we have + * alignment instructions that take place before the designated + * starting location listed in the SAM file. They won't get included + * in the callback function until they officially start, which is + * already too late. 
+ * + * So to workaround this, we prefix all CIGAR with 1D, move the + * position by 1bp, and then force the callback code to remove + * leaving pads (either P or D generated). + * + * Ie it's a level 10 hack! + */ + if (r >= 0) { + p = pnew; + p->next = NULL; + p->cd = NULL; + p->eofn = NULL; + p->eofl = NULL; + p->start = 2; + p->eof = 0; +#ifdef START_WITH_DEL + p->pos = pos-1; + p->cigar_ind = 0; + p->b_cigar = bam_get_cigar(&p->b); + if ((p->b_cigar[0] & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + p->cigar_len = p->b_cigar[0] >> BAM_CIGAR_SHIFT; + p->cigar_op = BAM_CHARD_CLIP; + if ((p->b_cigar[1] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP) { + /* xHxS... => xHxS1D... */ + p->b_cigar[0] = p->b_cigar[1]; + p->b_cigar[1] = (1 << BAM_CIGAR_SHIFT) | BAM_CDEL; + } else { + /* xH... => xH1D... */ + p->b_cigar[0] = (1 << BAM_CIGAR_SHIFT) | BAM_CDEL; + } + } else { + if ((p->b_cigar[0] & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP) { + /* xS... => xS1D... */ + p->cigar_len = p->b_cigar[0] >> BAM_CIGAR_SHIFT; + p->cigar_op = BAM_CSOFT_CLIP; + p->b_cigar[0] = (1 << BAM_CIGAR_SHIFT) | BAM_CDEL; + } else { + /* ... => 1D... 
*/ + p->cigar_len = 1; /* was 0 */ + p->cigar_op = BAM_CDEL; /* was 'X' */ + } + } + p->seq_offset = -1; + p->first_del = 1; +#else + p->pos = pos-1; + p->cigar_ind = 0; + p->b_cigar = bam_get_cigar(&p->b); + p->cigar_len = 0; + p->cigar_op = -1; + p->seq_offset = -1; + p->first_del = 0; +#endif + p->b_is_rev = bam_is_rev(&p->b); + p->b_qual = (uint8_t *)bam_get_qual(&p->b); + p->b_seq = (uint8_t *)bam_get_seq(&p->b); + + if (seq_init) { + int v; + v = seq_init(client_data, fp, h, p); + if (v == -1) + goto error; + + if (v == 1) { + /* Keep this seq */ + if (phead) { + ptail->next = p; + } else { + phead = p; + } + ptail = p; + } else { + /* Push back on free list */ + p->next = pfree; + pfree = p; + } + } else { + if (phead) + ptail->next = p; + else + phead = p; + ptail = p; + } + + /* Allocate the next pileup rec */ + if (pfree) { + pnew = pfree; + pfree = pfree->next; + } else { + if (NULL == (pnew = calloc(1, sizeof(*pnew)))) + goto error; + } + } + } while (r >= 0); + + ret = 0; + error: + + if (pnew) { + free(pnew->b.data); + free(pnew); + } + + /* Tidy up */ + for (p = pfree; p; p = next) { + next = p->next; + free(p->b.data); + free(p); + } + + return ret; +} diff --git a/samtools/consensus_pileup.h b/samtools/consensus_pileup.h new file mode 100644 index 0000000..6eafdbb --- /dev/null +++ b/samtools/consensus_pileup.h @@ -0,0 +1,79 @@ +/* consensus_pileup.h -- Pileup orientated data per consensus column + + Copyright (C) 2013-2016, 2020-2021 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +typedef struct pileup { + // commonly used things together, to fit in a cache line (64 bytes) + struct pileup *next; // A link list, for active seqs + void *cd; // General purpose per-seq client-data + int eof; // True if this sequence has finished + int qual; // Current qual (for active seq only) + char start; // True if this is a new sequence + char base; // Current base (for active seq only) in ASCII + char ref_skip; // True if the cause of eof or start is cigar N + char padding; // True if the base was added due to another seq + int base4; // Base in 4-bit notation (0-15) + hts_pos_t pos; // Current unpadded position in seq + int nth; // nth base at unpadded position 'pos' + int b_is_rev; // 0 => fwd, 1 => rev + int seq_offset; // Current base position in s->seq[] array.
+ + unsigned char *b_qual;// cached bam_qual + unsigned char *b_seq; // cached bam_seq + + // --- 64 bytes + struct pileup *eofn; // p->eof set, next eof member + struct pileup *eofl; // last non-eof that points to p with p->eof + + uint32_t *b_cigar; // cached bam_cigar + + int cigar_ind; // Current location in s->alignment cigar str + int cigar_op; // Current cigar operation + int cigar_len; // Remaining length of this cigar op + + int first_del; // Used when first base is a deletion + + bam1_t b; // Bam entry associated with struct +} pileup_t; + +int pileup_loop(samFile *fp, + sam_hdr_t *h, + int (*seq_fetch)(void *client_data, + samFile *fp, + sam_hdr_t *h, + bam1_t *b), + int (*seq_init)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p), + int (*seq_add)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert), + void *client_data); diff --git a/samtools/phase.c b/samtools/phase.c index 50f7a8f..a4a7351 100644 --- a/samtools/phase.c +++ b/samtools/phase.c @@ -597,6 +597,8 @@ int main_phase(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), + {"min-BQ", required_argument, NULL, 'Q'}, + {"min-bq", required_argument, NULL, 'Q'}, {"no-PG", no_argument, NULL, 1}, { NULL, 0, NULL, 0 } }; @@ -630,7 +632,8 @@ int main_phase(int argc, char *argv[]) fprintf(stderr, "Options: -k INT block length [%d]\n", g.k); fprintf(stderr, " -b STR prefix of BAMs to output [null]\n"); fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); - fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(stderr, " -Q, --min-BQ INT\n" + " min base quality in het calling [%d]\n", g.min_baseQ); fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); // fprintf(stderr, " -l FILE list of sites to phase [null]\n"); fprintf(stderr, " -F do not attempt to fix 
chimeras\n"); diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index 13ab556..b0b5257 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -599,6 +599,8 @@ int main_phase(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), + {"min-BQ", required_argument, NULL, 'Q'}, + {"min-bq", required_argument, NULL, 'Q'}, {"no-PG", no_argument, NULL, 1}, { NULL, 0, NULL, 0 } }; @@ -632,7 +634,8 @@ int main_phase(int argc, char *argv[]) fprintf(samtools_stderr, "Options: -k INT block length [%d]\n", g.k); fprintf(samtools_stderr, " -b STR prefix of BAMs to output [null]\n"); fprintf(samtools_stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); - fprintf(samtools_stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(samtools_stderr, " -Q, --min-BQ INT\n" + " min base quality in het calling [%d]\n", g.min_baseQ); fprintf(samtools_stderr, " -D INT max read depth [%d]\n", g.max_depth); // fprintf(samtools_stderr, " -l FILE list of sites to phase [null]\n"); fprintf(samtools_stderr, " -F do not attempt to fix chimeras\n"); diff --git a/samtools/sam_view.c b/samtools/sam_view.c index 7c4d7cc..c4d65d2 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -1,6 +1,6 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2021 Genome Research Ltd. + Copyright (C) 2009-2022 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/sam.h" #include "htslib/faidx.h" #include "htslib/khash.h" +#include "htslib/kstring.h" #include "htslib/thread_pool.h" #include "htslib/hts_expr.h" #include "samtools.h" @@ -79,6 +80,15 @@ typedef struct samview_settings { int unmap; auxhash_t remove_tag; auxhash_t keep_tag; + + hts_idx_t *hts_idx; + sam_hdr_t *header; + samFile *in, *out, *un_out; + int64_t count; + int is_count; + char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx; + int fetch_pairs, nreglist; + hts_reglist_t *reglist; } samview_settings_t; // Copied from htslib/sam.c. @@ -196,6 +206,11 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } + return 0; +} + +static int adjust_tags(const sam_hdr_t *h, bam1_t *b, + samview_settings_t* settings) { if (settings->keep_tag) { uint8_t *s_from, *s_to, *end = b->data + b->l_data; auxhash_t h = settings->keep_tag; @@ -207,7 +222,7 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin if (s == NULL) { print_error("view", "malformed aux data for record \"%s\"", bam_get_qname(b)); - break; + return -1; } if (kh_get(aux_exists, h, x) != kh_end(h) ) { @@ -229,7 +244,7 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin if (s == NULL) { print_error("view", "malformed aux data for record \"%s\"", bam_get_qname(b)); - break; + return -1; } if (kh_get(aux_exists, h, x) == kh_end(h) ) { @@ -404,47 +419,372 @@ int parse_aux_list(auxhash_t *h, char *optarg) { return 0; } +static int cmp_reglist_intervals(const void *aptr, const void *bptr) +{ + hts_pair_pos_t *a = (hts_pair_pos_t*)aptr; + hts_pair_pos_t *b = (hts_pair_pos_t*)bptr; + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + if ( a->end < b->end ) return -1; + if ( a->end > b->end ) return 1; + return 0; +} +static int cmp_reglist_tids(const 
void *aptr, const void *bptr) +{ + hts_reglist_t *a = (hts_reglist_t*)aptr; + hts_reglist_t *b = (hts_reglist_t*)bptr; + if ( b->tid==HTS_IDX_NOCOOR || a->tid < b->tid ) return -1; + if ( a->tid==HTS_IDX_NOCOOR || a->tid > b->tid ) return 1; + return 0; +} + +static hts_reglist_t *_reglist_dup(sam_hdr_t *hdr, hts_reglist_t *src, int nsrc) +{ + int i,j; + hts_reglist_t *dst = (hts_reglist_t*)calloc(nsrc,sizeof(hts_reglist_t)); + if ( !dst ) { + print_error_errno("view", "[%s:%d] could not allocate region list" + ,__FILE__ ,__LINE__); + return NULL; + } + for (i=0; i tid ) imax = i - 1; + else break; + } + if ( i<0 || reg[i].tid < tid ) i++; // not found, i will be the index of the inserted element + return i; +} +static int _reglist_push(hts_reglist_t **_reg, int *_nreg, int tid, hts_pos_t beg, hts_pos_t end) +{ + hts_reglist_t *reg = *_reg; + int nreg = *_nreg; + int i = _reglist_find_tid(reg,nreg,tid); + if ( i>=nreg || reg[i].tid!=tid ) { + nreg++; + reg = (hts_reglist_t*)realloc(reg,sizeof(hts_reglist_t)*nreg); + if ( !reg ) { + print_error_errno("view", "[%s:%d] could not extend region list", + __FILE__, __LINE__); + return -1; + } + if ( i+1 < nreg ) + memmove(reg + i + 1, reg + i, sizeof(hts_reglist_t)*(nreg - i - 1)); + reg[i].reg = NULL; + reg[i].tid = tid; + reg[i].min_beg = beg; + reg[i].max_end = end; + reg[i].intervals = NULL; + reg[i].count = 0; + } + *_reg = reg; + *_nreg = nreg; + if ( reg[i].count > 0 + && reg[i].intervals[reg[i].count - 1].beg==beg + && reg[i].intervals[reg[i].count - 1].end==end ) { + return 0; + } + hts_pair_pos_t *new_intervals = realloc(reg[i].intervals, sizeof(hts_pair_pos_t)*(reg[i].count + 1)); + if (!new_intervals) { + print_error_errno("view", "[%s:%d] could not extend region list", + __FILE__, __LINE__); + return -1; + } + reg[i].intervals = new_intervals; + reg[i].intervals[reg[i].count].beg = beg; + reg[i].intervals[reg[i].count].end = end; + reg[i].count++; + return 0; +} + +static void _reglist_merge(hts_reglist_t 
*reg, int nreg) +{ + int i,j; + for (i=0; ibed = bed_hash_regions(conf->bed, regs, 0, nregs, &filter_op); // insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file + if ( !filter_op ) + filter_state = FILTERED; + } + else + bed_unify(conf->bed); + if ( !conf->bed) { // index is unavailable or no regions have been specified + print_error("view", "No regions or BED file have been provided. Aborting."); + return NULL; + } + + int regcount = 0; + hts_reglist_t *reglist = bed_reglist(conf->bed, filter_state, ®count); + if (!reglist) { + print_error("view", "Region list is empty or could not be created. Aborting."); + return NULL; + } + + if ( conf->fetch_pairs ) { + conf->reglist = _reglist_dup(conf->header,reglist,regcount); + if (!conf->reglist) + return NULL; + conf->nreglist = regcount; + } + + iter = sam_itr_regions(conf->hts_idx, conf->header, reglist, regcount); + if ( !iter ) { + print_error("view", "Iterator could not be created. Aborting."); + return NULL; + } + return iter; +} + +KHASH_SET_INIT_STR(names) + +static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *iter) +{ + khint_t k; + int nunmap = 0, r = 0, nmates = 0, write_error = 0, retval = EXIT_FAILURE; + kh_names_t *mate_names = kh_init(names); + bam1_t *rec = bam_init1(); + + if (!mate_names) { + print_error_errno("view", "could not allocate mate names table"); + goto out; + } + if (!rec) { + print_error_errno("view", "could not allocate bam record"); + goto out; + } + + while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) { + if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue; + if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue; + if ( process_aln(conf->header, rec, conf) ) continue; + + nmates++; + + k = kh_get(names,mate_names,bam_get_qname(rec)); + if ( k == kh_end(mate_names) ) { + int ret = 0; + char *name_copy = 
strdup(bam_get_qname(rec)); + if (!name_copy) { + print_error_errno("view", "[%s:%d] could not store sample name, %d elements", __FILE__,__LINE__,nmates); + goto out; + } + kh_put(names, mate_names, name_copy, &ret); + if ( ret<0 ) { + print_error_errno("view", "[%s:%d] could not store sample name, %d elements",__FILE__,__LINE__,nmates); + free(name_copy); + goto out; + } + } + + if ( rec->core.mtid < 0 || (rec->core.flag & BAM_FMUNMAP) ) nunmap = 1; + if ( rec->core.mtid >= 0 ) { + if (_reglist_push(&conf->reglist, &conf->nreglist, rec->core.mtid, rec->core.mpos,rec->core.mpos+1) != 0) + goto out; + } + } + + if (r < -1) { + print_error_errno("view", "error reading file \"%s\"", conf->fn_in); + goto out; + } + + _reglist_merge(conf->reglist, conf->nreglist); + if ( nunmap ) { + if (_reglist_push(&conf->reglist,&conf->nreglist,HTS_IDX_NOCOOR,0,HTS_POS_MAX) != 0) + goto out; + } + hts_itr_multi_destroy(iter); + iter = sam_itr_regions(conf->hts_idx, conf->header, conf->reglist, conf->nreglist); + if ( !iter ) { + print_error_errno("view", "[%s:%d] iterator could not be created",__FILE__,__LINE__); + goto out; + } + while ((r = sam_itr_multi_next(conf->in, iter, rec))>=0) { + int drop = 1; + if (rec->core.tid >=0 && + bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.tid), rec->core.pos, bam_endpos(rec))) drop = 0; + if ( drop ) { + k = kh_get(names,mate_names,bam_get_qname(rec)); + if ( k != kh_end(mate_names) ) drop = 0; + } + if (!drop && process_aln(conf->header, rec, conf) == 0) { + if (adjust_tags(conf->header, rec, conf) != 0) + goto out; + if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out, + &write_error) < 0) + goto out; + } + } + + if (r < -1) { + print_error_errno("view", "error reading file \"%s\"", conf->fn_in); + goto out; + } + + retval = EXIT_SUCCESS; + + out: + hts_itr_multi_destroy(iter); + hts_idx_destroy(conf->hts_idx); // destroy the BAM index + conf->hts_idx = NULL; + if (mate_names) { + // free khash keys + for (k = 0; 
k < kh_end(mate_names); ++k) + if ( kh_exist(mate_names,k) ) free((char*)kh_key(mate_names, k)); + kh_destroy(names,mate_names); + } + bam_destroy1(rec); + return retval; +} + +// Common code for processing and writing a record +static inline int process_one_record(samview_settings_t *conf, bam1_t *b, + int *write_error) { + if (!process_aln(conf->header, b, conf)) { + if (!conf->is_count) { + change_flag(b, conf); + if (adjust_tags(conf->header, b, conf) != 0) + return -1; + if (check_sam_write1(conf->out, conf->header, + b, conf->fn_out, write_error) < 0) { + return -1; + } + } + conf->count++; + } else if (conf->unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(conf->out, conf->header, + b, conf->fn_out, write_error) < 0) { + return -1; + } + } else { + if (conf->un_out) { + if (check_sam_write1(conf->un_out, conf->header, + b, conf->fn_un_out, write_error) < 0) { + return -1; + } + } + } + return 0; +} + +static int stream_view(samview_settings_t *conf) { + bam1_t *b = bam_init1(); + int write_error = 0, r; + if (!b) { + print_error_errno("view", "could not allocate bam record"); + return 1; + } + while ((r = sam_read1(conf->in, conf->header, b)) >= 0) { + if (process_one_record(conf, b, &write_error) < 0) break; + } + bam_destroy1(b); + if (r < -1) { + print_error_errno("view", "error reading file \"%s\"", conf->fn_in); + return 1; + } + return write_error; +} +// Pass each record yielded by iter to process_one_record(); iter is destroyed once iteration ends. Returns 1 on allocation or read failure, otherwise write_error as left by process_one_record(). +static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter) +{ + bam1_t *b = bam_init1(); + int write_error = 0, result; + if (!b) { + print_error_errno("view", "could not allocate bam record"); + return 1; + } + // fetch alignments + while ((result = sam_itr_multi_next(conf->in, iter, b)) >= 0) { + if (process_one_record(conf, b, &write_error) < 0) break; + } + const int last_tid = iter->curr_tid; // read before iter is freed below; the error message needs it + hts_itr_multi_destroy(iter); + bam_destroy1(b); + if (result < -1) { + print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", last_tid); + return 1; + } + 
return write_error; +} + // Make mnemonic distinct values for longoption-only options #define LONGOPT(c) ((c) + 128) +// Check for ".sam" filenames as sam_open_mode cannot distinguish between +// foo.sam and foo.unknown, both getting mode "". +static int is_sam(const char *fn) { + if (!fn) + return 0; + size_t l = strlen(fn); + return (l >= 4 && strcasecmp(fn + l-4, ".sam") == 0); +} + int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; - int64_t count = 0; - samFile *in = 0, *out = 0, *un_out=0; + samview_settings_t settings; + int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, has_index_file = 0, no_pg = 0; FILE *fp_out = NULL; - sam_hdr_t *header = NULL; - char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = ""; - char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0; - char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; + char out_mode[6] = {0}, out_un_mode[6] = {0}; + char *out_format = ""; + char *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; - int filter_state = ALL, filter_op = 0; - int result; - - samview_settings_t settings = { - .rghash = NULL, - .tvhash = NULL, - .min_mapQ = 0, - .flag_on = 0, - .flag_off = 0, - .flag_alloff = 0, - .flag_anyon = 0, - .min_qlen = 0, - .remove_B = 0, - .subsam_seed = 0, - .subsam_frac = -1., - .library = NULL, - .bed = NULL, - .multi_region = 0, - .tag = NULL, - .filter = NULL, - .remove_flag = 0, - .add_flag = 0, - .keep_tag = NULL, - .remove_tag = NULL, - .unmap = 0, - }; + + memset(&settings,0,sizeof(settings)); + settings.subsam_frac = -1.0; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), @@ -460,6 +800,7 @@ int main_samview(int argc, char *argv[]) {"expression", required_argument, NULL, 'e'}, {"fai-reference", required_argument, NULL, 't'}, {"fast", no_argument,
NULL, '1'}, + {"fetch-pairs", no_argument, NULL, 'P'}, {"header-only", no_argument, NULL, 'H'}, {"help", no_argument, NULL, LONGOPT('?')}, {"incl-flags", required_argument, NULL, LONGOPT('g')}, @@ -512,15 +853,16 @@ int main_samview(int argc, char *argv[]) // set optopt to '\0'). opterr = 0; + char *tmp; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:p", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pP", lopts, NULL)) >= 0) { switch (c) { case 's': - settings.subsam_seed = strtol(optarg, &q, 10); - if (q && *q == '.') { - settings.subsam_frac = strtod(q, &q); - if (*q) ret = 1; + settings.subsam_seed = strtol(optarg, &tmp, 10); + if (tmp && *tmp == '.') { + settings.subsam_frac = strtod(tmp, &tmp); + if (*tmp) ret = 1; } else { ret = 1; } @@ -531,24 +873,24 @@ int main_samview(int argc, char *argv[]) } break; case LONGOPT('s'): - settings.subsam_frac = strtod(optarg, &q); - if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { + settings.subsam_frac = strtod(optarg, &tmp); + if (*tmp || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { print_error("view", "Incorrect sampling argument \"%s\"", optarg); goto view_end; } break; case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; case 'm': settings.min_qlen = atoi(optarg); break; - case 'c': is_count = 1; break; + case 'c': settings.is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; - case 't': fn_fai = strdup(optarg); break; + case 't': settings.fn_fai = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case LONGOPT('H'): is_header = is_header_only = 0; break; - case 'o': fn_out = strdup(optarg); break; - case 'U': fn_un_out = strdup(optarg); break; + case 'o': settings.fn_out = strdup(optarg); break; + case 'U': settings.fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; case 'f': settings.flag_on |= 
bam_str2flag(optarg); break; case 'F': settings.flag_off |= bam_str2flag(optarg); break; @@ -560,6 +902,7 @@ int main_samview(int argc, char *argv[]) case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; case 'p': settings.unmap = 1; break; + case 'P': settings.fetch_pairs = 1; settings.multi_region = 1; break; case LONGOPT('L'): settings.multi_region = 1; // fall through @@ -646,10 +989,6 @@ int main_samview(int argc, char *argv[]) goto view_end; } break; - /* REMOVED as htslib doesn't support this - //case 'x': out_format = "x"; break; - //case 'X': out_format = "X"; break; - */ case LONGOPT('?'): return usage(stdout, EXIT_SUCCESS, 1); case '?': @@ -703,16 +1042,36 @@ int main_samview(int argc, char *argv[]) break; } } - if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference); - if (compress_level >= 0 && !*out_format) out_format = "b"; + if (settings.is_count && settings.fetch_pairs) + { + print_error("view","The options -P and -c cannot be combined\n"); + return 1; + } + if (settings.fn_fai == 0 && ga.reference) settings.fn_fai = fai_path(ga.reference); if (is_header_only) is_header = 1; // File format auto-detection first - if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL); - if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL); - // Overridden by manual -b, -C - if (*out_format) + if (settings.fn_out) sam_open_mode(out_mode+1, settings.fn_out, NULL); + if (settings.fn_un_out) sam_open_mode(out_un_mode+1, settings.fn_un_out, NULL); + + // -1 or -u without an explicit format (-b, -C) => check fn extensions + if (!*out_format && compress_level >= 0) { + if (compress_level == 0 && + (out_mode[strlen(out_mode)-1] == 'z' || + out_un_mode[strlen(out_un_mode)-1] == 'z')) + // z, fz, Fz sanity check + fprintf(stderr, "[view] Warning option -u ignored due to" + " filename suffix\n"); + + // If known extension, use it, otherwise BAM + if (!(out_mode[1] || is_sam(settings.fn_out))) + out_mode[1] = 'b'; + + if 
(!(out_un_mode[1] || is_sam(settings.fn_un_out))) + out_un_mode[1] = 'b'; + } else if (*out_format) { out_mode[1] = out_un_mode[1] = *out_format; - // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul. + } + if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; @@ -724,7 +1083,7 @@ int main_samview(int argc, char *argv[]) return usage(stderr, EXIT_FAILURE, 0); // potential memory leak... } - if (settings.unmap && fn_un_out) { + if (settings.unmap && settings.fn_un_out) { print_error("view", "Options --unoutput and --unmap are mutually exclusive."); ret = 1; goto view_end; @@ -737,42 +1096,42 @@ int main_samview(int argc, char *argv[]) settings.subsam_seed = rand(); } - fn_in = (optind < argc)? argv[optind] : "-"; - if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { - print_error_errno("view", "failed to open \"%s\" for reading", fn_in); + settings.fn_in = (optind < argc)? argv[optind] : "-"; + if ((settings.in = sam_open_format(settings.fn_in, "r", &ga.in)) == 0) { + print_error_errno("view", "failed to open \"%s\" for reading", settings.fn_in); ret = 1; goto view_end; } - if (fn_fai) { - if (hts_set_fai_filename(in, fn_fai) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); + if (settings.fn_fai) { + if (hts_set_fai_filename(settings.in, settings.fn_fai) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", settings.fn_fai); ret = 1; goto view_end; } } - if ((header = sam_hdr_read(in)) == 0) { - fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); + if ((settings.header = sam_hdr_read(settings.in)) == 0) { + fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", settings.fn_in); ret = 1; goto view_end; } if (settings.rghash) { - sam_hdr_remove_lines(header, "RG", "ID", settings.rghash); + sam_hdr_remove_lines(settings.header, "RG", "ID", settings.rghash); } - if (!is_count) { - if ((out = sam_open_format(fn_out? 
fn_out : "-", out_mode, &ga.out)) == 0) { - print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output"); + if (!settings.is_count) { + if ((settings.out = sam_open_format(settings.fn_out? settings.fn_out : "-", out_mode, &ga.out)) == 0) { + print_error_errno("view", "failed to open \"%s\" for writing", settings.fn_out? settings.fn_out : "standard output"); ret = 1; goto view_end; } - if (fn_fai) { - if (hts_set_fai_filename(out, fn_fai) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); + if (settings.fn_fai) { + if (hts_set_fai_filename(settings.out, settings.fn_fai) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", settings.fn_fai); ret = 1; goto view_end; } } - autoflush_if_stdout(out, fn_out); + autoflush_if_stdout(settings.out, settings.fn_out); if (!no_pg) { if (!(arg_list = stringify_argv(argc+1, argv-1))) { @@ -780,7 +1139,7 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if (sam_hdr_add_pg(header, "samtools", + if (sam_hdr_add_pg(settings.header, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? 
arg_list : NULL, @@ -791,47 +1150,47 @@ int main_samview(int argc, char *argv[]) } } - if (*out_format || ga.write_index || is_header || + if (ga.write_index || is_header || out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { - if (sam_hdr_write(out, header) != 0) { + if (sam_hdr_write(settings.out, settings.header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (ga.write_index) { - if (!(fn_out_idx = auto_index(out, fn_out, header))) { + if (!(settings.fn_out_idx = auto_index(settings.out, settings.fn_out, settings.header))) { ret = 1; goto view_end; } } - if (fn_un_out) { - if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { - print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); + if (settings.fn_un_out) { + if ((settings.un_out = sam_open_format(settings.fn_un_out, out_un_mode, &ga.out)) == 0) { + print_error_errno("view", "failed to open \"%s\" for writing", settings.fn_un_out); ret = 1; goto view_end; } - if (fn_fai) { - if (hts_set_fai_filename(un_out, fn_fai) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); + if (settings.fn_fai) { + if (hts_set_fai_filename(settings.un_out, settings.fn_fai) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", settings.fn_fai); ret = 1; goto view_end; } } - autoflush_if_stdout(un_out, fn_un_out); - if (*out_format || is_header || + autoflush_if_stdout(settings.un_out, settings.fn_un_out); + if (ga.write_index || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { - if (sam_hdr_write(un_out, header) != 0) { + if (sam_hdr_write(settings.un_out, settings.header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (ga.write_index) { - if (!(fn_un_out_idx = auto_index(un_out, fn_un_out, 
header))) { + if (!(settings.fn_un_out_idx = auto_index(settings.un_out, settings.fn_un_out, settings.header))) { ret = 1; goto view_end; } @@ -839,14 +1198,15 @@ int main_samview(int argc, char *argv[]) } } else { - if (fn_out) { - fp_out = fopen(fn_out, "w"); + if (settings.fn_out) { + fp_out = fopen(settings.fn_out, "w"); if (fp_out == NULL) { - print_error_errno("view", "can't create \"%s\"", fn_out); + print_error_errno("view", "can't create \"%s\"", settings.fn_out); ret = EXIT_FAILURE; goto view_end; } } + settings.unmap = 0; // Not valid in counting mode } if (ga.nthreads > 1) { @@ -855,188 +1215,93 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); - if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(settings.in, HTS_OPT_THREAD_POOL, &p); + if (settings.out) hts_set_opt(settings.out, HTS_OPT_THREAD_POOL, &p); } if (is_header_only) goto view_end; // no need to print alignments - if (has_index_file) { - fn_idx_in = (optind+1 < argc)? argv[optind+1] : 0; - if (fn_idx_in == 0) { - fprintf(stderr, "[main_samview] incorrect number of arguments for -X option. Aborting.\n"); + + // Initialize BAM/CRAM index + char **regs = NULL; + int nregs = 0; + if ( has_index_file && optind < argc - 2 ) regs = &argv[optind+2], nregs = argc - optind - 2, settings.fn_idx_in = argv[optind+1]; + else if ( !has_index_file && optind < argc - 1 ) regs = &argv[optind+1], nregs = argc - optind - 1; + else if ( has_index_file ) + { + print_error("view", "Incorrect number of arguments for -X option. Aborting."); + return 1; + } + if ( settings.fn_idx_in || nregs || settings.multi_region ) + { + settings.hts_idx = settings.fn_idx_in ? 
sam_index_load2(settings.in, settings.fn_in, settings.fn_idx_in) : sam_index_load(settings.in, settings.fn_in); + if ( !settings.hts_idx ) + { + print_error("view", "Random alignment retrieval only works for indexed SAM.gz, BAM or CRAM files."); return 1; } } - if (settings.multi_region) { - if (!has_index_file && optind < argc - 1) { //regions have been specified in the command line - settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file - if (!filter_op) - filter_state = FILTERED; - } else if (has_index_file && optind < argc - 2) { - settings.bed = bed_hash_regions(settings.bed, argv, optind+2, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file - if (!filter_op) - filter_state = FILTERED; - } else { - bed_unify(settings.bed); - } - - bam1_t *b = bam_init1(); - if (settings.bed == NULL) { // index is unavailable or no regions have been specified - fprintf(stderr, "[main_samview] no regions or BED file have been provided. 
Aborting.\n"); - } else { - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder - if (fn_idx_in != 0) { - idx = sam_index_load2(in, fn_in, fn_idx_in); // load index - } else { - idx = sam_index_load(in, fn_in); - } - if (idx != NULL) { - - int regcount = 0; - - hts_reglist_t *reglist = bed_reglist(settings.bed, filter_state, ®count); - if(reglist) { - hts_itr_multi_t *iter = sam_itr_regions(idx, header, reglist, regcount); - if (iter) { - // fetch alignments - while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { - if (!process_aln(header, b, &settings)) { - if (!is_count) { - change_flag(b, &settings); - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } - count++; - } else if (settings.unmap) { - b->core.flag |= BAM_FUNMAP; - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } - } - } - if (result < -1) { - print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); - ret = 1; - } - - hts_itr_multi_destroy(iter); - } else { - fprintf(stderr, "[main_samview] iterator could not be created. Aborting.\n"); - } - } else { - fprintf(stderr, "[main_samview] region list is empty or could not be created. 
Aborting.\n"); - } - hts_idx_destroy(idx); // destroy the BAM index - } else { - fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - } - } - bam_destroy1(b); - } else { - if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file - bam1_t *b = bam_init1(); - int r; - errno = 0; - while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' - if (!process_aln(header, b, &settings)) { - if (!is_count) { - change_flag(b, &settings); - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } - count++; - } else if (settings.unmap) { - b->core.flag |= BAM_FUNMAP; - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } - } - } - if (r < -1) { - print_error_errno("view", "error reading file \"%s\"", fn_in); - ret = 1; - } - bam_destroy1(b); - } else { // retrieve alignments in specified regions - int i; - bam1_t *b; - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder - if (fn_idx_in != NULL) { - idx = sam_index_load2(in, fn_in, fn_idx_in); // load index - } else { - idx = sam_index_load(in, fn_in); - } - if (idx == 0) { // index is unavailable - fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - ret = 1; - goto view_end; - } - b = bam_init1(); - - for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { - int result; - hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' - if (iter == NULL) { // region invalid or reference name not found - fprintf(stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. 
Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments - while ((result = sam_itr_next(in, iter, b)) >= 0) { - if (!process_aln(header, b, &settings)) { - if (!is_count) { - change_flag(b, &settings); - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } - count++; - } else if (settings.unmap) { - b->core.flag |= BAM_FUNMAP; - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } - } - } - hts_itr_destroy(iter); - if (result < -1) { - print_error("view", "retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file", argv[i]); - ret = 1; - break; - } + if ( settings.fetch_pairs ) + { + hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs); + ret = iter ? fetch_pairs_collect_mates(&settings, iter) : 1; + if (ret) goto view_end; + } + else if ( settings.multi_region ) + { + hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs); + ret = iter ? multi_region_view(&settings, iter) : 1; + if (ret) goto view_end; + } + else if ( !settings.hts_idx ) // stream through the entire file + { + ret = stream_view(&settings); + if (ret) goto view_end; + } else { // retrieve alignments in specified regions + int i; + for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { + hts_itr_t *iter = sam_itr_querys(settings.hts_idx, settings.header, argv[i]); // parse a region in the format like `chr2:100-200' + if (iter == NULL) { // region invalid or reference name not found + fprintf(stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. 
Continue anyway.\n", argv[i]); + continue; } - bam_destroy1(b); - hts_idx_destroy(idx); // destroy the BAM index + // fetch alignments + ret = multi_region_view(&settings, iter); + if (ret) goto view_end; } } + if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx); + if (ga.write_index) { - if (sam_idx_save(out) < 0) { + if (sam_idx_save(settings.out) < 0) { print_error_errno("view", "writing index failed"); ret = 1; } - if (un_out && sam_idx_save(un_out) < 0) { + if (settings.un_out && sam_idx_save(settings.un_out) < 0) { print_error_errno("view", "writing index failed"); ret = 1; } } view_end: - if (is_count && ret == 0) { - if (fprintf(fn_out? fp_out : stdout, "%" PRId64 "\n", count) < 0) { - if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out); + if (settings.is_count && ret == 0) { + if (fprintf(settings.fn_out? fp_out : stdout, "%" PRId64 "\n", settings.count) < 0) { + if (settings.fn_out) print_error_errno("view", "writing to \"%s\" failed", settings.fn_out); else print_error_errno("view", "writing to standard output failed"); ret = EXIT_FAILURE; } } // close files, free and return - if (in) check_sam_close("view", in, fn_in, "standard input", &ret); - if (out) check_sam_close("view", out, fn_out, "standard output", &ret); - if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); + if (settings.in) check_sam_close("view", settings.in, settings.fn_in, "standard input", &ret); + if (settings.out) check_sam_close("view", settings.out, settings.fn_out, "standard output", &ret); + if (settings.un_out) check_sam_close("view", settings.un_out, settings.fn_un_out, "file", &ret); if (fp_out) fclose(fp_out); - free(fn_fai); free(fn_out); free(settings.library); free(fn_un_out); + free(settings.fn_fai); free(settings.fn_out); free(settings.library); free(settings.fn_un_out); sam_global_args_free(&ga); - if ( header ) sam_hdr_destroy(header); + if ( settings.header ) sam_hdr_destroy(settings.header); if (settings.bed) 
bed_destroy(settings.bed); if (settings.rghash) { khint_t k; @@ -1069,10 +1334,10 @@ view_end: if (p.pool) hts_tpool_destroy(p.pool); - if (fn_out_idx) - free(fn_out_idx); - if (fn_un_out_idx) - free(fn_un_out_idx); + if (settings.fn_out_idx) + free(settings.fn_out_idx); + if (settings.fn_un_out_idx) + free(settings.fn_un_out_idx); free(arg_list); if (settings.keep_tag) @@ -1093,8 +1358,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "Output options:\n" " -b, --bam Output BAM\n" " -C, --cram Output CRAM (requires -T)\n" -" -1, --fast Use fast BAM compression (implies --bam)\n" -" -u, --uncompressed Uncompressed BAM output (implies --bam)\n" +" -1, --fast Use fast BAM compression (and default to --bam)\n" +" -u, --uncompressed Uncompressed BAM output (and default to --bam)\n" " -h, --with-header Include header in SAM output\n" " -H, --header-only Print SAM header only (no alignments)\n" " --no-header Print SAM alignment records only [default]\n" @@ -1104,6 +1369,7 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " Output reads not selected by filters to FILE\n" " -p, --unmap Set flag to UNMAP on reads not selected\n" " then write to output file.\n" +" -P, --fetch-pairs Retrieve complete pairs even when outside of region\n" "Input options:\n" " -t, --fai-reference FILE FILE listing reference names and lengths\n" " -M, --use-index Use index and multi-region iterator for regions\n" @@ -1123,6 +1389,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -e, --expr STR ...match the filter expression STR\n" " -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x " -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0 +" --rf, --incl-flags, --include-flags FLAG\n" +" ...have some of the FLAGs present\n" " -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option " --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n" " --subsample-seed INT 
Influence WHICH reads are kept in subsampling [0]\n" @@ -1194,3 +1462,123 @@ static int usage(FILE *fp, int exit_status, int is_long_help) return exit_status; } + +static int head_usage(FILE *fp, int exit_status) +{ + fprintf(fp, +"Usage: samtools head [OPTION]... [FILE]\n" +"Options:\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT alignment record lines [none]\n" +); + sam_global_opt_help(fp, "-.--T@-."); + return exit_status; +} + +int main_head(int argc, char *argv[]) +{ + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'T', '@'), + { "headers", required_argument, NULL, 'h' }, + { "records", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + + int all_headers = 1; + uint64_t nheaders = 0; + uint64_t nrecords = 0; + + int c, nargs; + while ((c = getopt_long(argc, argv, "h:n:T:@:", lopts, NULL)) >= 0) + switch (c) { + case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; + case 'n': nrecords = strtoull(optarg, NULL, 0); break; + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + return head_usage(stderr, EXIT_FAILURE); + } + + nargs = argc - optind; + if (nargs == 0 && isatty(STDIN_FILENO)) + return head_usage(stdout, EXIT_SUCCESS); + else if (nargs > 1) + return head_usage(stderr, EXIT_FAILURE); + + samFile *fp = NULL; + sam_hdr_t *hdr = NULL; + kstring_t str = KS_INITIALIZE; + bam1_t *b = NULL; + + const char *fname = (nargs == 1)? 
argv[optind] : "-"; + fp = sam_open_format(fname, "r", &ga.in); + if (fp == NULL) { + if (strcmp(fname, "-") != 0) + print_error_errno("head", "failed to open \"%s\" for reading", fname); + else + print_error_errno("head", "failed to open standard input for reading"); + goto err; + } + + if (ga.nthreads > 0) hts_set_threads(fp, ga.nthreads); + + hdr = sam_hdr_read(fp); + if (hdr == NULL) { + if (strcmp(fname, "-") != 0) + print_error("head", "failed to read the header from \"%s\"", fname); + else + print_error("head", "failed to read the header"); + goto err; + } + + if (all_headers) { + fputs(sam_hdr_str(hdr), stdout); + } + else if (nheaders > 0) { + const char *text = sam_hdr_str(hdr); + const char *lim = text; + uint64_t n; + for (n = 0; n < nheaders; n++) { + lim = strchr(lim, '\n'); + if (lim) lim++; + else break; + } + if (lim) fwrite(text, lim - text, 1, stdout); + else fputs(text, stdout); + } + + if (nrecords > 0) { + b = bam_init1(); + uint64_t n; + int r; + for (n = 0; n < nrecords && (r = sam_read1(fp, hdr, b)) >= 0; n++) { + if (sam_format1(hdr, b, &str) < 0) { + print_error_errno("head", "couldn't format record"); + goto err; + } + puts(ks_str(&str)); + } + if (r < -1) { + print_error("head", "\"%s\" is truncated", fname); + goto err; + } + bam_destroy1(b); + ks_free(&str); + } + + sam_hdr_destroy(hdr); + sam_close(fp); + sam_global_args_free(&ga); + + return EXIT_SUCCESS; + +err: + if (fp) sam_close(fp); + sam_hdr_destroy(hdr); + bam_destroy1(b); + ks_free(&str); + sam_global_args_free(&ga); + return EXIT_FAILURE; +} diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 9bcc9ac..e768ec4 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -2,7 +2,7 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2021 Genome Research Ltd. + Copyright (C) 2009-2022 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
Author: Heng Li @@ -38,6 +38,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "htslib/faidx.h" #include "htslib/khash.h" +#include "htslib/kstring.h" #include "htslib/thread_pool.h" #include "htslib/hts_expr.h" #include "samtools.h" @@ -81,6 +82,15 @@ typedef struct samview_settings { int unmap; auxhash_t remove_tag; auxhash_t keep_tag; + + hts_idx_t *hts_idx; + sam_hdr_t *header; + samFile *in, *out, *un_out; + int64_t count; + int is_count; + char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx; + int fetch_pairs, nreglist; + hts_reglist_t *reglist; } samview_settings_t; // Copied from htslib/sam.c. @@ -198,6 +208,11 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } + return 0; +} + +static int adjust_tags(const sam_hdr_t *h, bam1_t *b, + samview_settings_t* settings) { if (settings->keep_tag) { uint8_t *s_from, *s_to, *end = b->data + b->l_data; auxhash_t h = settings->keep_tag; @@ -209,7 +224,7 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin if (s == NULL) { print_error("view", "malformed aux data for record \"%s\"", bam_get_qname(b)); - break; + return -1; } if (kh_get(aux_exists, h, x) != kh_end(h) ) { @@ -231,7 +246,7 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin if (s == NULL) { print_error("view", "malformed aux data for record \"%s\"", bam_get_qname(b)); - break; + return -1; } if (kh_get(aux_exists, h, x) == kh_end(h) ) { @@ -406,47 +421,372 @@ int parse_aux_list(auxhash_t *h, char *optarg) { return 0; } +static int cmp_reglist_intervals(const void *aptr, const void *bptr) +{ + hts_pair_pos_t *a = (hts_pair_pos_t*)aptr; + hts_pair_pos_t *b = (hts_pair_pos_t*)bptr; + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + if ( a->end < b->end ) return -1; + if ( a->end > b->end ) 
return 1; + return 0; +} +static int cmp_reglist_tids(const void *aptr, const void *bptr) +{ + hts_reglist_t *a = (hts_reglist_t*)aptr; + hts_reglist_t *b = (hts_reglist_t*)bptr; + if ( b->tid==HTS_IDX_NOCOOR || a->tid < b->tid ) return -1; + if ( a->tid==HTS_IDX_NOCOOR || a->tid > b->tid ) return 1; + return 0; +} + +static hts_reglist_t *_reglist_dup(sam_hdr_t *hdr, hts_reglist_t *src, int nsrc) +{ + int i,j; + hts_reglist_t *dst = (hts_reglist_t*)calloc(nsrc,sizeof(hts_reglist_t)); + if ( !dst ) { + print_error_errno("view", "[%s:%d] could not allocate region list" + ,__FILE__ ,__LINE__); + return NULL; + } + for (i=0; i tid ) imax = i - 1; + else break; + } + if ( i<0 || reg[i].tid < tid ) i++; // not found, i will be the index of the inserted element + return i; +} +static int _reglist_push(hts_reglist_t **_reg, int *_nreg, int tid, hts_pos_t beg, hts_pos_t end) +{ + hts_reglist_t *reg = *_reg; + int nreg = *_nreg; + int i = _reglist_find_tid(reg,nreg,tid); + if ( i>=nreg || reg[i].tid!=tid ) { + nreg++; + reg = (hts_reglist_t*)realloc(reg,sizeof(hts_reglist_t)*nreg); + if ( !reg ) { + print_error_errno("view", "[%s:%d] could not extend region list", + __FILE__, __LINE__); + return -1; + } + if ( i+1 < nreg ) + memmove(reg + i + 1, reg + i, sizeof(hts_reglist_t)*(nreg - i - 1)); + reg[i].reg = NULL; + reg[i].tid = tid; + reg[i].min_beg = beg; + reg[i].max_end = end; + reg[i].intervals = NULL; + reg[i].count = 0; + } + *_reg = reg; + *_nreg = nreg; + if ( reg[i].count > 0 + && reg[i].intervals[reg[i].count - 1].beg==beg + && reg[i].intervals[reg[i].count - 1].end==end ) { + return 0; + } + hts_pair_pos_t *new_intervals = realloc(reg[i].intervals, sizeof(hts_pair_pos_t)*(reg[i].count + 1)); + if (!new_intervals) { + print_error_errno("view", "[%s:%d] could not extend region list", + __FILE__, __LINE__); + return -1; + } + reg[i].intervals = new_intervals; + reg[i].intervals[reg[i].count].beg = beg; + reg[i].intervals[reg[i].count].end = end; + reg[i].count++; + 
return 0; +} + +static void _reglist_merge(hts_reglist_t *reg, int nreg) +{ + int i,j; + for (i=0; ibed = bed_hash_regions(conf->bed, regs, 0, nregs, &filter_op); // insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file + if ( !filter_op ) + filter_state = FILTERED; + } + else + bed_unify(conf->bed); + if ( !conf->bed) { // index is unavailable or no regions have been specified + print_error("view", "No regions or BED file have been provided. Aborting."); + return NULL; + } + + int regcount = 0; + hts_reglist_t *reglist = bed_reglist(conf->bed, filter_state, ®count); + if (!reglist) { + print_error("view", "Region list is empty or could not be created. Aborting."); + return NULL; + } + + if ( conf->fetch_pairs ) { + conf->reglist = _reglist_dup(conf->header,reglist,regcount); + if (!conf->reglist) + return NULL; + conf->nreglist = regcount; + } + + iter = sam_itr_regions(conf->hts_idx, conf->header, reglist, regcount); + if ( !iter ) { + print_error("view", "Iterator could not be created. 
Aborting."); + return NULL; + } + return iter; +} + +KHASH_SET_INIT_STR(names) + +static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t *iter) +{ + khint_t k; + int nunmap = 0, r = 0, nmates = 0, write_error = 0, retval = EXIT_FAILURE; + kh_names_t *mate_names = kh_init(names); + bam1_t *rec = bam_init1(); + + if (!mate_names) { + print_error_errno("view", "could not allocate mate names table"); + goto out; + } + if (!rec) { + print_error_errno("view", "could not allocate bam record"); + goto out; + } + + while ((r =sam_itr_multi_next(conf->in, iter, rec))>=0) { + if ( (rec->core.flag & BAM_FPAIRED) == 0 ) continue; + if ( rec->core.mtid>=0 && bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.mtid), rec->core.mpos, rec->core.mpos) ) continue; + if ( process_aln(conf->header, rec, conf) ) continue; + + nmates++; + + k = kh_get(names,mate_names,bam_get_qname(rec)); + if ( k == kh_end(mate_names) ) { + int ret = 0; + char *name_copy = strdup(bam_get_qname(rec)); + if (!name_copy) { + print_error_errno("view", "[%s:%d] could not store sample name, %d elements", __FILE__,__LINE__,nmates); + goto out; + } + kh_put(names, mate_names, name_copy, &ret); + if ( ret<0 ) { + print_error_errno("view", "[%s:%d] could not store sample name, %d elements",__FILE__,__LINE__,nmates); + free(name_copy); + goto out; + } + } + + if ( rec->core.mtid < 0 || (rec->core.flag & BAM_FMUNMAP) ) nunmap = 1; + if ( rec->core.mtid >= 0 ) { + if (_reglist_push(&conf->reglist, &conf->nreglist, rec->core.mtid, rec->core.mpos,rec->core.mpos+1) != 0) + goto out; + } + } + + if (r < -1) { + print_error_errno("view", "error reading file \"%s\"", conf->fn_in); + goto out; + } + + _reglist_merge(conf->reglist, conf->nreglist); + if ( nunmap ) { + if (_reglist_push(&conf->reglist,&conf->nreglist,HTS_IDX_NOCOOR,0,HTS_POS_MAX) != 0) + goto out; + } + hts_itr_multi_destroy(iter); + iter = sam_itr_regions(conf->hts_idx, conf->header, conf->reglist, conf->nreglist); + if ( 
!iter ) { + print_error_errno("view", "[%s:%d] iterator could not be created",__FILE__,__LINE__); + goto out; + } + while ((r = sam_itr_multi_next(conf->in, iter, rec))>=0) { + int drop = 1; + if (rec->core.tid >=0 && + bed_overlap(conf->bed, sam_hdr_tid2name(conf->header,rec->core.tid), rec->core.pos, bam_endpos(rec))) drop = 0; + if ( drop ) { + k = kh_get(names,mate_names,bam_get_qname(rec)); + if ( k != kh_end(mate_names) ) drop = 0; + } + if (!drop && process_aln(conf->header, rec, conf) == 0) { + if (adjust_tags(conf->header, rec, conf) != 0) + goto out; + if (check_sam_write1(conf->out, conf->header, rec, conf->fn_out, + &write_error) < 0) + goto out; + } + } + + if (r < -1) { + print_error_errno("view", "error reading file \"%s\"", conf->fn_in); + goto out; + } + + retval = EXIT_SUCCESS; + + out: + hts_itr_multi_destroy(iter); + hts_idx_destroy(conf->hts_idx); // destroy the BAM index + conf->hts_idx = NULL; + if (mate_names) { + // free khash keys + for (k = 0; k < kh_end(mate_names); ++k) + if ( kh_exist(mate_names,k) ) free((char*)kh_key(mate_names, k)); + kh_destroy(names,mate_names); + } + bam_destroy1(rec); + return retval; +} + +// Common code for processing and writing a record +static inline int process_one_record(samview_settings_t *conf, bam1_t *b, + int *write_error) { + if (!process_aln(conf->header, b, conf)) { + if (!conf->is_count) { + change_flag(b, conf); + if (adjust_tags(conf->header, b, conf) != 0) + return -1; + if (check_sam_write1(conf->out, conf->header, + b, conf->fn_out, write_error) < 0) { + return -1; + } + } + conf->count++; + } else if (conf->unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(conf->out, conf->header, + b, conf->fn_out, write_error) < 0) { + return -1; + } + } else { + if (conf->un_out) { + if (check_sam_write1(conf->un_out, conf->header, + b, conf->fn_un_out, write_error) < 0) { + return -1; + } + } + } + return 0; +} + +static int stream_view(samview_settings_t *conf) { + bam1_t *b = bam_init1(); + 
int write_error = 0, r; + if (!b) { + print_error_errno("view", "could not allocate bam record"); + return 1; + } + while ((r = sam_read1(conf->in, conf->header, b)) >= 0) { + if (process_one_record(conf, b, &write_error) < 0) break; + } + bam_destroy1(b); + if (r < -1) { + print_error_errno("view", "error reading file \"%s\"", conf->fn_in); + return 1; + } + return write_error; +} + +static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter) +{ + bam1_t *b = bam_init1(); + int write_error = 0, result; + if (!b) { + print_error_errno("view", "could not allocate bam record"); + return 1; + } + // fetch alignments + while ((result = sam_itr_multi_next(conf->in, iter, b)) >= 0) { + if (process_one_record(conf, b, &write_error) < 0) break; + } + hts_itr_multi_destroy(iter); + bam_destroy1(b); + + if (result < -1) { + print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); + return 1; + } + return write_error; +} + // Make mnemonic distinct values for longoption-only options #define LONGOPT(c) ((c) + 128) +// Check for ".sam" filenames as sam_open_mode cannot distinguish between +// foo.sam and foo.unknown, both getting mode "". 
+static int is_sam(const char *fn) { + if (!fn) + return 0; + size_t l = strlen(fn); + return (l >= 4 && strcasecmp(fn + l-4, ".sam") == 0); +} + int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; - int64_t count = 0; - samFile *in = 0, *out = 0, *un_out=0; + samview_settings_t settings; + int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, has_index_file = 0, no_pg = 0; FILE *fp_out = NULL; - sam_hdr_t *header = NULL; - char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = ""; - char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0; - char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; + char out_mode[6] = {0}, out_un_mode[6] = {0}; + char *out_format = ""; + char *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; - int filter_state = ALL, filter_op = 0; - int result; - - samview_settings_t settings = { - .rghash = NULL, - .tvhash = NULL, - .min_mapQ = 0, - .flag_on = 0, - .flag_off = 0, - .flag_alloff = 0, - .flag_anyon = 0, - .min_qlen = 0, - .remove_B = 0, - .subsam_seed = 0, - .subsam_frac = -1., - .library = NULL, - .bed = NULL, - .multi_region = 0, - .tag = NULL, - .filter = NULL, - .remove_flag = 0, - .add_flag = 0, - .keep_tag = NULL, - .remove_tag = NULL, - .unmap = 0, - }; + + memset(&settings,0,sizeof(settings)); + settings.subsam_frac = -1.0; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), @@ -462,6 +802,7 @@ int main_samview(int argc, char *argv[]) {"expression", required_argument, NULL, 'e'}, {"fai-reference", required_argument, NULL, 't'}, {"fast", no_argument, NULL, '1'}, + {"fetch-pairs", no_argument, NULL, 'P'}, {"header-only", no_argument, NULL, 'H'}, {"help", no_argument, NULL, LONGOPT('?')}, {"incl-flags", required_argument, NULL, LONGOPT('g')}, @@ -514,15 +855,16 @@ int main_samview(int argc, 
char *argv[]) // set optopt to '\0'). opterr = 0; + char *tmp; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:p", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pP", lopts, NULL)) >= 0) { switch (c) { case 's': - settings.subsam_seed = strtol(optarg, &q, 10); - if (q && *q == '.') { - settings.subsam_frac = strtod(q, &q); - if (*q) ret = 1; + settings.subsam_seed = strtol(optarg, &tmp, 10); + if (tmp && *tmp == '.') { + settings.subsam_frac = strtod(tmp, &tmp); + if (*tmp) ret = 1; } else { ret = 1; } @@ -533,24 +875,24 @@ int main_samview(int argc, char *argv[]) } break; case LONGOPT('s'): - settings.subsam_frac = strtod(optarg, &q); - if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { + settings.subsam_frac = strtod(optarg, &tmp); + if (*tmp || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { print_error("view", "Incorrect sampling argument \"%s\"", optarg); goto view_end; } break; case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; case 'm': settings.min_qlen = atoi(optarg); break; - case 'c': is_count = 1; break; + case 'c': settings.is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; - case 't': fn_fai = strdup(optarg); break; + case 't': settings.fn_fai = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case LONGOPT('H'): is_header = is_header_only = 0; break; - case 'o': fn_out = strdup(optarg); break; - case 'U': fn_un_out = strdup(optarg); break; + case 'o': settings.fn_out = strdup(optarg); break; + case 'U': settings.fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; case 'f': settings.flag_on |= bam_str2flag(optarg); break; case 'F': settings.flag_off |= bam_str2flag(optarg); break; @@ -562,6 +904,7 @@ int main_samview(int argc, char *argv[]) case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; case 'p': 
settings.unmap = 1; break; + case 'P': settings.fetch_pairs = 1; settings.multi_region = 1; break; case LONGOPT('L'): settings.multi_region = 1; // fall through @@ -648,10 +991,6 @@ int main_samview(int argc, char *argv[]) goto view_end; } break; - /* REMOVED as htslib doesn't support this - //case 'x': out_format = "x"; break; - //case 'X': out_format = "X"; break; - */ case LONGOPT('?'): return usage(samtools_stdout, EXIT_SUCCESS, 1); case '?': @@ -705,16 +1044,36 @@ int main_samview(int argc, char *argv[]) break; } } - if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference); - if (compress_level >= 0 && !*out_format) out_format = "b"; + if (settings.is_count && settings.fetch_pairs) + { + print_error("view","The options -P and -c cannot be combined\n"); + return 1; + } + if (settings.fn_fai == 0 && ga.reference) settings.fn_fai = fai_path(ga.reference); if (is_header_only) is_header = 1; // File format auto-detection first - if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL); - if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL); - // Overridden by manual -b, -C - if (*out_format) + if (settings.fn_out) sam_open_mode(out_mode+1, settings.fn_out, NULL); + if (settings.fn_un_out) sam_open_mode(out_un_mode+1, settings.fn_un_out, NULL); + + // -1 or -u without an explicit format (-b, -C) => check fn extensions + if (!*out_format && compress_level >= 0) { + if (compress_level == 0 && + (out_mode[strlen(out_mode)-1] == 'z' || + out_un_mode[strlen(out_un_mode)-1] == 'z')) + // z, fz, Fz sanity check + fprintf(samtools_stderr, "[view] Warning option -u ignored due to" + " filename suffix\n"); + + // If known extension, use it, otherwise BAM + if (!(out_mode[1] || is_sam(settings.fn_out))) + out_mode[1] = 'b'; + + if (!(out_un_mode[1] || is_sam(settings.fn_un_out))) + out_un_mode[1] = 'b'; + } else if (*out_format) { out_mode[1] = out_un_mode[1] = *out_format; - // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul. 
+ } + if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; @@ -726,7 +1085,7 @@ int main_samview(int argc, char *argv[]) return usage(samtools_stderr, EXIT_FAILURE, 0); // potential memory leak... } - if (settings.unmap && fn_un_out) { + if (settings.unmap && settings.fn_un_out) { print_error("view", "Options --unoutput and --unmap are mutually exclusive."); ret = 1; goto view_end; @@ -739,42 +1098,42 @@ int main_samview(int argc, char *argv[]) settings.subsam_seed = rand(); } - fn_in = (optind < argc)? argv[optind] : "-"; - if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { - print_error_errno("view", "failed to open \"%s\" for reading", fn_in); + settings.fn_in = (optind < argc)? argv[optind] : "-"; + if ((settings.in = sam_open_format(settings.fn_in, "r", &ga.in)) == 0) { + print_error_errno("view", "failed to open \"%s\" for reading", settings.fn_in); ret = 1; goto view_end; } - if (fn_fai) { - if (hts_set_fai_filename(in, fn_fai) != 0) { - fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); + if (settings.fn_fai) { + if (hts_set_fai_filename(settings.in, settings.fn_fai) != 0) { + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", settings.fn_fai); ret = 1; goto view_end; } } - if ((header = sam_hdr_read(in)) == 0) { - fprintf(samtools_stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); + if ((settings.header = sam_hdr_read(settings.in)) == 0) { + fprintf(samtools_stderr, "[main_samview] fail to read the header from \"%s\".\n", settings.fn_in); ret = 1; goto view_end; } if (settings.rghash) { - sam_hdr_remove_lines(header, "RG", "ID", settings.rghash); + sam_hdr_remove_lines(settings.header, "RG", "ID", settings.rghash); } - if (!is_count) { - if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { - print_error_errno("view", "failed to open \"%s\" for writing", fn_out? 
fn_out : "standard output"); + if (!settings.is_count) { + if ((settings.out = sam_open_format(settings.fn_out? settings.fn_out : "-", out_mode, &ga.out)) == 0) { + print_error_errno("view", "failed to open \"%s\" for writing", settings.fn_out? settings.fn_out : "standard output"); ret = 1; goto view_end; } - if (fn_fai) { - if (hts_set_fai_filename(out, fn_fai) != 0) { - fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); + if (settings.fn_fai) { + if (hts_set_fai_filename(settings.out, settings.fn_fai) != 0) { + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", settings.fn_fai); ret = 1; goto view_end; } } - autoflush_if_stdout(out, fn_out); + autoflush_if_stdout(settings.out, settings.fn_out); if (!no_pg) { if (!(arg_list = stringify_argv(argc+1, argv-1))) { @@ -782,7 +1141,7 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if (sam_hdr_add_pg(header, "samtools", + if (sam_hdr_add_pg(settings.header, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? 
arg_list : NULL, @@ -793,47 +1152,47 @@ int main_samview(int argc, char *argv[]) } } - if (*out_format || ga.write_index || is_header || + if (ga.write_index || is_header || out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { - if (sam_hdr_write(out, header) != 0) { + if (sam_hdr_write(settings.out, settings.header) != 0) { fprintf(samtools_stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (ga.write_index) { - if (!(fn_out_idx = auto_index(out, fn_out, header))) { + if (!(settings.fn_out_idx = auto_index(settings.out, settings.fn_out, settings.header))) { ret = 1; goto view_end; } } - if (fn_un_out) { - if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { - print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); + if (settings.fn_un_out) { + if ((settings.un_out = sam_open_format(settings.fn_un_out, out_un_mode, &ga.out)) == 0) { + print_error_errno("view", "failed to open \"%s\" for writing", settings.fn_un_out); ret = 1; goto view_end; } - if (fn_fai) { - if (hts_set_fai_filename(un_out, fn_fai) != 0) { - fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); + if (settings.fn_fai) { + if (hts_set_fai_filename(settings.un_out, settings.fn_fai) != 0) { + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", settings.fn_fai); ret = 1; goto view_end; } } - autoflush_if_stdout(un_out, fn_un_out); - if (*out_format || is_header || + autoflush_if_stdout(settings.un_out, settings.fn_un_out); + if (ga.write_index || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { - if (sam_hdr_write(un_out, header) != 0) { + if (sam_hdr_write(settings.un_out, settings.header) != 0) { fprintf(samtools_stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (ga.write_index) { - if (!(fn_un_out_idx 
= auto_index(un_out, fn_un_out, header))) { + if (!(settings.fn_un_out_idx = auto_index(settings.un_out, settings.fn_un_out, settings.header))) { ret = 1; goto view_end; } @@ -841,14 +1200,15 @@ int main_samview(int argc, char *argv[]) } } else { - if (fn_out) { - fp_out = fopen(fn_out, "w"); + if (settings.fn_out) { + fp_out = fopen(settings.fn_out, "w"); if (fp_out == NULL) { - print_error_errno("view", "can't create \"%s\"", fn_out); + print_error_errno("view", "can't create \"%s\"", settings.fn_out); ret = EXIT_FAILURE; goto view_end; } } + settings.unmap = 0; // Not valid in counting mode } if (ga.nthreads > 1) { @@ -857,188 +1217,93 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); - if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(settings.in, HTS_OPT_THREAD_POOL, &p); + if (settings.out) hts_set_opt(settings.out, HTS_OPT_THREAD_POOL, &p); } if (is_header_only) goto view_end; // no need to print alignments - if (has_index_file) { - fn_idx_in = (optind+1 < argc)? argv[optind+1] : 0; - if (fn_idx_in == 0) { - fprintf(samtools_stderr, "[main_samview] incorrect number of arguments for -X option. Aborting.\n"); + + // Initialize BAM/CRAM index + char **regs = NULL; + int nregs = 0; + if ( has_index_file && optind < argc - 2 ) regs = &argv[optind+2], nregs = argc - optind - 2, settings.fn_idx_in = argv[optind+1]; + else if ( !has_index_file && optind < argc - 1 ) regs = &argv[optind+1], nregs = argc - optind - 1; + else if ( has_index_file ) + { + print_error("view", "Incorrect number of arguments for -X option. Aborting."); + return 1; + } + if ( settings.fn_idx_in || nregs || settings.multi_region ) + { + settings.hts_idx = settings.fn_idx_in ? 
sam_index_load2(settings.in, settings.fn_in, settings.fn_idx_in) : sam_index_load(settings.in, settings.fn_in); + if ( !settings.hts_idx ) + { + print_error("view", "Random alignment retrieval only works for indexed SAM.gz, BAM or CRAM files."); return 1; } } - if (settings.multi_region) { - if (!has_index_file && optind < argc - 1) { //regions have been specified in the command line - settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file - if (!filter_op) - filter_state = FILTERED; - } else if (has_index_file && optind < argc - 2) { - settings.bed = bed_hash_regions(settings.bed, argv, optind+2, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file - if (!filter_op) - filter_state = FILTERED; - } else { - bed_unify(settings.bed); - } - - bam1_t *b = bam_init1(); - if (settings.bed == NULL) { // index is unavailable or no regions have been specified - fprintf(samtools_stderr, "[main_samview] no regions or BED file have been provided. 
Aborting.\n"); - } else { - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder - if (fn_idx_in != 0) { - idx = sam_index_load2(in, fn_in, fn_idx_in); // load index - } else { - idx = sam_index_load(in, fn_in); - } - if (idx != NULL) { - - int regcount = 0; - - hts_reglist_t *reglist = bed_reglist(settings.bed, filter_state, ®count); - if(reglist) { - hts_itr_multi_t *iter = sam_itr_regions(idx, header, reglist, regcount); - if (iter) { - // fetch alignments - while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { - if (!process_aln(header, b, &settings)) { - if (!is_count) { - change_flag(b, &settings); - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } - count++; - } else if (settings.unmap) { - b->core.flag |= BAM_FUNMAP; - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } - } - } - if (result < -1) { - print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); - ret = 1; - } - - hts_itr_multi_destroy(iter); - } else { - fprintf(samtools_stderr, "[main_samview] iterator could not be created. Aborting.\n"); - } - } else { - fprintf(samtools_stderr, "[main_samview] region list is empty or could not be created. 
Aborting.\n"); - } - hts_idx_destroy(idx); // destroy the BAM index - } else { - fprintf(samtools_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - } - } - bam_destroy1(b); - } else { - if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file - bam1_t *b = bam_init1(); - int r; - errno = 0; - while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' - if (!process_aln(header, b, &settings)) { - if (!is_count) { - change_flag(b, &settings); - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } - count++; - } else if (settings.unmap) { - b->core.flag |= BAM_FUNMAP; - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } - } - } - if (r < -1) { - print_error_errno("view", "error reading file \"%s\"", fn_in); - ret = 1; - } - bam_destroy1(b); - } else { // retrieve alignments in specified regions - int i; - bam1_t *b; - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder - if (fn_idx_in != NULL) { - idx = sam_index_load2(in, fn_in, fn_idx_in); // load index - } else { - idx = sam_index_load(in, fn_in); - } - if (idx == 0) { // index is unavailable - fprintf(samtools_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - ret = 1; - goto view_end; - } - b = bam_init1(); - - for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { - int result; - hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' - if (iter == NULL) { // region invalid or reference name not found - fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. 
Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments - while ((result = sam_itr_next(in, iter, b)) >= 0) { - if (!process_aln(header, b, &settings)) { - if (!is_count) { - change_flag(b, &settings); - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } - count++; - } else if (settings.unmap) { - b->core.flag |= BAM_FUNMAP; - if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; - } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } - } - } - hts_itr_destroy(iter); - if (result < -1) { - print_error("view", "retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file", argv[i]); - ret = 1; - break; - } + if ( settings.fetch_pairs ) + { + hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs); + ret = iter ? fetch_pairs_collect_mates(&settings, iter) : 1; + if (ret) goto view_end; + } + else if ( settings.multi_region ) + { + hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs); + ret = iter ? multi_region_view(&settings, iter) : 1; + if (ret) goto view_end; + } + else if ( !settings.hts_idx ) // stream through the entire file + { + ret = stream_view(&settings); + if (ret) goto view_end; + } else { // retrieve alignments in specified regions + int i; + for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { + hts_itr_t *iter = sam_itr_querys(settings.hts_idx, settings.header, argv[i]); // parse a region in the format like `chr2:100-200' + if (iter == NULL) { // region invalid or reference name not found + fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. 
Continue anyway.\n", argv[i]); + continue; } - bam_destroy1(b); - hts_idx_destroy(idx); // destroy the BAM index + // fetch alignments + ret = multi_region_view(&settings, iter); + if (ret) goto view_end; } } + if ( settings.hts_idx ) hts_idx_destroy(settings.hts_idx); + if (ga.write_index) { - if (sam_idx_save(out) < 0) { + if (sam_idx_save(settings.out) < 0) { print_error_errno("view", "writing index failed"); ret = 1; } - if (un_out && sam_idx_save(un_out) < 0) { + if (settings.un_out && sam_idx_save(settings.un_out) < 0) { print_error_errno("view", "writing index failed"); ret = 1; } } view_end: - if (is_count && ret == 0) { - if (fprintf(fn_out? fp_out : samtools_stdout, "%" PRId64 "\n", count) < 0) { - if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out); + if (settings.is_count && ret == 0) { + if (fprintf(settings.fn_out? fp_out : samtools_stdout, "%" PRId64 "\n", settings.count) < 0) { + if (settings.fn_out) print_error_errno("view", "writing to \"%s\" failed", settings.fn_out); else print_error_errno("view", "writing to standard output failed"); ret = EXIT_FAILURE; } } // close files, free and return - if (in) check_sam_close("view", in, fn_in, "standard input", &ret); - if (out) check_sam_close("view", out, fn_out, "standard output", &ret); - if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); + if (settings.in) check_sam_close("view", settings.in, settings.fn_in, "standard input", &ret); + if (settings.out) check_sam_close("view", settings.out, settings.fn_out, "standard output", &ret); + if (settings.un_out) check_sam_close("view", settings.un_out, settings.fn_un_out, "file", &ret); if (fp_out) fclose(fp_out); - free(fn_fai); free(fn_out); free(settings.library); free(fn_un_out); + free(settings.fn_fai); free(settings.fn_out); free(settings.library); free(settings.fn_un_out); sam_global_args_free(&ga); - if ( header ) sam_hdr_destroy(header); + if ( settings.header ) sam_hdr_destroy(settings.header); if 
(settings.bed) bed_destroy(settings.bed); if (settings.rghash) { khint_t k; @@ -1071,10 +1336,10 @@ view_end: if (p.pool) hts_tpool_destroy(p.pool); - if (fn_out_idx) - free(fn_out_idx); - if (fn_un_out_idx) - free(fn_un_out_idx); + if (settings.fn_out_idx) + free(settings.fn_out_idx); + if (settings.fn_un_out_idx) + free(settings.fn_un_out_idx); free(arg_list); if (settings.keep_tag) @@ -1095,8 +1360,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "Output options:\n" " -b, --bam Output BAM\n" " -C, --cram Output CRAM (requires -T)\n" -" -1, --fast Use fast BAM compression (implies --bam)\n" -" -u, --uncompressed Uncompressed BAM output (implies --bam)\n" +" -1, --fast Use fast BAM compression (and default to --bam)\n" +" -u, --uncompressed Uncompressed BAM output (and default to --bam)\n" " -h, --with-header Include header in SAM output\n" " -H, --header-only Print SAM header only (no alignments)\n" " --no-header Print SAM alignment records only [default]\n" @@ -1106,6 +1371,7 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " Output reads not selected by filters to FILE\n" " -p, --unmap Set flag to UNMAP on reads not selected\n" " then write to output file.\n" +" -P, --fetch-pairs Retrieve complete pairs even when outside of region\n" "Input options:\n" " -t, --fai-reference FILE FILE listing reference names and lengths\n" " -M, --use-index Use index and multi-region iterator for regions\n" @@ -1125,6 +1391,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -e, --expr STR ...match the filter expression STR\n" " -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x " -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0 +" --rf, --incl-flags, --include-flags FLAG\n" +" ...have some of the FLAGs present\n" " -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option " --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n" " 
--subsample-seed INT Influence WHICH reads are kept in subsampling [0]\n" @@ -1196,3 +1464,123 @@ static int usage(FILE *fp, int exit_status, int is_long_help) return exit_status; } + +static int head_usage(FILE *fp, int exit_status) +{ + fprintf(fp, +"Usage: samtools head [OPTION]... [FILE]\n" +"Options:\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT alignment record lines [none]\n" +); + sam_global_opt_help(fp, "-.--T@-."); + return exit_status; +} + +int main_head(int argc, char *argv[]) +{ + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'T', '@'), + { "headers", required_argument, NULL, 'h' }, + { "records", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + + int all_headers = 1; + uint64_t nheaders = 0; + uint64_t nrecords = 0; + + int c, nargs; + while ((c = getopt_long(argc, argv, "h:n:T:@:", lopts, NULL)) >= 0) + switch (c) { + case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; + case 'n': nrecords = strtoull(optarg, NULL, 0); break; + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + return head_usage(samtools_stderr, EXIT_FAILURE); + } + + nargs = argc - optind; + if (nargs == 0 && isatty(STDIN_FILENO)) + return head_usage(samtools_stdout, EXIT_SUCCESS); + else if (nargs > 1) + return head_usage(samtools_stderr, EXIT_FAILURE); + + samFile *fp = NULL; + sam_hdr_t *hdr = NULL; + kstring_t str = KS_INITIALIZE; + bam1_t *b = NULL; + + const char *fname = (nargs == 1)? 
argv[optind] : "-"; + fp = sam_open_format(fname, "r", &ga.in); + if (fp == NULL) { + if (strcmp(fname, "-") != 0) + print_error_errno("head", "failed to open \"%s\" for reading", fname); + else + print_error_errno("head", "failed to open standard input for reading"); + goto err; + } + + if (ga.nthreads > 0) hts_set_threads(fp, ga.nthreads); + + hdr = sam_hdr_read(fp); + if (hdr == NULL) { + if (strcmp(fname, "-") != 0) + print_error("head", "failed to read the header from \"%s\"", fname); + else + print_error("head", "failed to read the header"); + goto err; + } + + if (all_headers) { + fputs(sam_hdr_str(hdr), samtools_stdout); + } + else if (nheaders > 0) { + const char *text = sam_hdr_str(hdr); + const char *lim = text; + uint64_t n; + for (n = 0; n < nheaders; n++) { + lim = strchr(lim, '\n'); + if (lim) lim++; + else break; + } + if (lim) fwrite(text, lim - text, 1, samtools_stdout); + else fputs(text, samtools_stdout); + } + + if (nrecords > 0) { + b = bam_init1(); + uint64_t n; + int r; + for (n = 0; n < nrecords && (r = sam_read1(fp, hdr, b)) >= 0; n++) { + if (sam_format1(hdr, b, &str) < 0) { + print_error_errno("head", "couldn't format record"); + goto err; + } + samtools_puts(ks_str(&str)); + } + if (r < -1) { + print_error("head", "\"%s\" is truncated", fname); + goto err; + } + bam_destroy1(b); + ks_free(&str); + } + + sam_hdr_destroy(hdr); + sam_close(fp); + sam_global_args_free(&ga); + + return EXIT_SUCCESS; + +err: + if (fp) sam_close(fp); + sam_hdr_destroy(hdr); + bam_destroy1(b); + ks_free(&str); + sam_global_args_free(&ga); + return EXIT_FAILURE; +} diff --git a/samtools/samtools.pysam.c b/samtools/samtools.pysam.c index 7044603..10740a4 100644 --- a/samtools/samtools.pysam.c +++ b/samtools/samtools.pysam.c @@ -1,5 +1,4 @@ -#include -#include +#include #include #include #include @@ -62,6 +61,15 @@ static int samtools_status = 0; int samtools_dispatch(int argc, char *argv[]) { + /* Reset getopt()/getopt_long() processing. 
*/ +#if defined __GLIBC__ + optind = 0; +#elif defined _OPTRESET || defined _OPTRESET_DECLARED + optreset = optind = 1; +#else + optind = 1; +#endif + if (setjmp(samtools_jmpbuf) == 0) return samtools_main(argc, argv); else @@ -73,17 +81,3 @@ void samtools_exit(int status) samtools_status = status; longjmp(samtools_jmpbuf, 1); } - - -void samtools_set_optind(int val) -{ - // setting this in cython via - // "from posix.unistd cimport optind" - // did not work. - // - // setting to 0 forces a complete re-initialization - optind = val; -} - - - diff --git a/samtools/samtools.pysam.h b/samtools/samtools.pysam.h index 9d20ecb..cb63b60 100644 --- a/samtools/samtools.pysam.h +++ b/samtools/samtools.pysam.h @@ -53,8 +53,22 @@ int samtools_dispatch(int argc, char *argv[]); void PYSAM_NORETURN samtools_exit(int status); -void samtools_set_optind(int); - extern int samtools_main(int argc, char *argv[]); - + +/* Define these only in samtools/bcftools C source, not Cython code. */ +#if !(defined CYTHON_ABI || defined CYTHON_HEX_VERSION) + +/*! Several non-static function names are used in both samtools and bcftools. + Both libcsamtools.so and libcbcftools.so are loaded simultaneously, leading + to collisions and wrong functions being called. #define these names so the + actual symbol names include distinct prefixes to avoid collisions. + */ +#define main_consensus samtools_main_consensus +#define main_reheader samtools_main_reheader +#define bam_smpl_init samtools_bam_smpl_init +#define bam_smpl_destroy samtools_bam_smpl_destroy +#define read_file_list samtools_read_file_list + +#endif + #endif diff --git a/samtools/splaysort.h b/samtools/splaysort.h new file mode 100644 index 0000000..af83533 --- /dev/null +++ b/samtools/splaysort.h @@ -0,0 +1,200 @@ +/* The MIT License + + Copyright (c) 2021 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#ifndef SPLAYSORT_H +#define SPLAYSORT_H + +#define SPLAYSORT_INIT(name, type_t, __sort_lt) \ + typedef struct splaynode_##name { \ + type_t value; \ + struct splaynode_##name *left; \ + struct splaynode_##name *right; \ + struct splaynode_##name *parent; \ + } splaynode_##name; \ + \ + void rotate_left_##name(splaynode_##name *node); \ + void rotate_right_##name(splaynode_##name *node); \ + int splay_sort_##name(size_t n, type_t array[] ); \ + int splay_flatten_##name(splaynode_##name *node, type_t dest[], size_t n);\ + splaynode_##name *splay_tree_##name(splaynode_##name *node); \ + splaynode_##name *splay_insert_##name(splaynode_##name *node, \ + type_t value, \ + splaynode_##name *node_ptr); \ + \ + void rotate_left_##name(splaynode_##name *node) { \ + splaynode_##name *parent = node->parent; \ + splaynode_##name *grandparent = parent->parent; \ + parent->right = node->left; \ + if (node->left != NULL) { \ + node->left->parent = parent; \ + } \ + node->left = parent; \ + parent->parent = node; \ + node->parent = grandparent; \ + \ + if (grandparent != NULL) { \ + if (grandparent->left == parent) { \ + grandparent->left = node; \ + } else { \ + grandparent->right = node; \ + } \ + } \ + } \ + \ + void rotate_right_##name(splaynode_##name *node) { \ + splaynode_##name *parent = node->parent; \ + splaynode_##name *grandparent = parent->parent; \ + parent->left = node->right; \ + \ + if (node->right != NULL) { \ + node->right->parent = parent; \ + } \ + node->right = parent; \ + parent->parent = node; \ + node->parent = grandparent; \ + \ + if (grandparent != NULL) { \ + if (grandparent->left == parent) { \ + grandparent->left = node; \ + } else { \ + grandparent->right = node; \ + } \ + } \ + } \ + int splay_sort_##name(size_t n, type_t array[] ) { \ + if (n < 1) { \ + return 0; \ + } \ + int i; \ + splaynode_##name *node_pool = malloc(sizeof(splaynode_##name) * n); \ + if (node_pool == NULL) return -1; \ + splaynode_##name *head = node_pool; \ + 
head->value = array[0]; \ + head->left = NULL; head->right = NULL; head->parent = NULL; \ + for (i = 1; i < n; i++) { \ + head = splay_insert_##name(head, array[i], node_pool + i ); \ + } \ + \ + if (splay_flatten_##name(head, array, n) == -1) { \ + free(node_pool); \ + return -1; \ + } \ + free(node_pool); \ + return 0; \ + } \ + \ + int splay_flatten_##name(splaynode_##name *head, type_t *dest, size_t n) {\ + int sp = 0, i = 0; \ + splaynode_##name *current = head; \ + splaynode_##name **stack = malloc(sizeof(current)*n); \ + if (stack == NULL) return -1; \ + \ + do { \ + while (current != NULL && sp < n) { \ + stack[sp++] = current; \ + current = current->left; \ + } \ + if (sp != 0) { \ + sp--; \ + dest[i++] = stack[sp]->value; \ + current = stack[sp]->right; \ + } \ + } while (!(current == NULL && sp == 0)); \ + \ + free(stack); \ + return 0; \ + } \ + splaynode_##name *splay_insert_##name(splaynode_##name *head, \ + type_t value, \ + splaynode_##name *node_ptr) { \ + splaynode_##name *parent = NULL; \ + while (head != NULL) { \ + parent = head; \ + if (__sort_lt(value, head->value)) { \ + head = head->left; \ + } else { \ + head = head->right; \ + } \ + } \ + splaynode_##name *new_node = node_ptr; \ + new_node->value = value; \ + new_node->left = NULL; \ + new_node->right = NULL; \ + new_node->parent = parent; \ + if (parent) { \ + if (__sort_lt(value, parent->value)) { \ + parent->left = new_node; \ + } else { \ + parent->right = new_node; \ + } \ + } \ + new_node = splay_tree_##name(new_node); \ + return new_node; \ + } \ + \ + splaynode_##name *splay_tree_##name(splaynode_##name *node) { \ + splaynode_##name *parent = node->parent; \ + \ + if (node->parent == NULL) { \ + return node; \ + } \ + if (node == parent->left) { \ + if (parent->parent == NULL) { \ + /* zig */ \ + rotate_right_##name(node); \ + } else if (parent->parent->left == parent) { \ + /* left zig zig */ \ + rotate_right_##name(node); \ + rotate_right_##name(node); \ + } else { \ + /* right 
left zig zag */ \ + rotate_right_##name(node); \ + rotate_left_##name(node); \ + } \ + } else { \ + if (parent->parent == NULL) { \ + /* zig */ \ + rotate_left_##name(node); \ + } else if (parent->parent->right == parent) { \ + /* right zig zig */ \ + rotate_left_##name(node); \ + rotate_left_##name(node); \ + } else { \ + /* left right zig zag */ \ + rotate_left_##name(node); \ + rotate_right_##name(node); \ + } \ + } \ + \ + if (node->parent != NULL) { \ + return splay_tree_##name(node); \ + } \ + return node; \ + } \ + + +#define splaysort(name, n, array) splay_sort_##name(n, array) + +#endif diff --git a/samtools/stats.c b/samtools/stats.c index 1b4f051..55a6465 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -1063,6 +1063,7 @@ static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stat pc->chunks = calloc(pc->m, sizeof(hts_pair_pos_t)); if ( !pc->chunks ) { fprintf(stderr, "Error allocating memory\n"); + free(pc); return; } @@ -2452,13 +2453,13 @@ int main_stats(int argc, char *argv[]) } if (init_stat_info_fname(info, bam_fname, &ga.in)) { - free(info); + cleanup_stats_info(info); return 1; } if (has_index_file && !(bam_idx_fname = argv[optind++])) { fprintf(stderr, "No index file provided\n"); - free(info); + cleanup_stats_info(info); return 1; } diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index 7f763f8..5158827 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -1065,6 +1065,7 @@ static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stat pc->chunks = calloc(pc->m, sizeof(hts_pair_pos_t)); if ( !pc->chunks ) { fprintf(samtools_stderr, "Error allocating memory\n"); + free(pc); return; } @@ -2454,13 +2455,13 @@ int main_stats(int argc, char *argv[]) } if (init_stat_info_fname(info, bam_fname, &ga.in)) { - free(info); + cleanup_stats_info(info); return 1; } if (has_index_file && !(bam_idx_fname = argv[optind++])) { fprintf(samtools_stderr, "No index file provided\n"); - 
free(info); + cleanup_stats_info(info); return 1; } diff --git a/samtools/version.sh b/samtools/version.sh index 0347be5..e943440 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.14 +VERSION=1.15.1 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/setup.cfg b/setup.cfg index 1f061e5..ef3f511 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,5 +6,38 @@ universal = 0 # -v: verbose output addopts = -s -v testpaths = pysam tests -pep8maxlinelength = 120 -pep8ignore = E402 + +[flake8] +max-line-length = 100 +max-complexity = 23 +extend-ignore = E117, E124, E125, E201, E202, E211, E225, E231, E265, E266, E302, E303, E305, E402, E501, E701, E713, E722, E741, F403, F405, F811, F821, F841, W291, W293, W391, W605 +per-file-ignores = __init__.py:F401 + +# E117 over-indented +# E124 closing bracket does not match visual indentation +# E125 continuation line with same indent as next logical line +# E201 whitespace after '{' +# E202 whitespace before '}' +# E211 whitespace before '(' +# E225 missing whitespace around operator +# E231 missing whitespace after ':' +# E265 block comment should start with '# ' +# E266 too many leading '#' for block comment +# E302 expected 2 blank lines, found 1 +# E303 too many blank lines +# E305 expected 2 blank lines after class or function definition, found 1 +# E402 module level import not at top of file +# E501 line too long +# E701 multiple statements on one line (colon) +# E713 test for membership should be 'not in' +# E722 do not use bare 'except' +# E741 ambiguous variable name '...' +# F403 'from ... import *' used; unable to detect undefined names +# F405 '...' may be undefined, or defined from star imports: ... +# F811 redefinition of unused '...' from line ... +# F821 undefined name '...' +# F841 local variable '...' 
is assigned to but never used +# W291 trailing whitespace +# W293 blank line contains whitespace +# W391 blank line at end of file +# W605 invalid escape sequence '...' diff --git a/setup.py b/setup.py index 5f2bb00..2a3a386 100644 --- a/setup.py +++ b/setup.py @@ -31,16 +31,19 @@ import sysconfig from contextlib import contextmanager from distutils import log from setuptools import setup, Command +from distutils.command.build import build from setuptools.command.sdist import sdist +from distutils.errors import LinkError from cy_build import CyExtension as Extension, cy_build_ext as build_ext try: - import cython + import cython # noqa HAVE_CYTHON = True except ImportError: HAVE_CYTHON = False IS_PYTHON3 = sys.version_info.major >= 3 +IS_DARWIN = platform.system() == 'Darwin' @contextmanager @@ -82,6 +85,25 @@ def run_make_print_config(): return make_print_config +def run_nm_defined_symbols(objfile): + stdout = subprocess.check_output(["nm", "-g", "-P", objfile]) + if IS_PYTHON3: + stdout = stdout.decode("ascii") + + symbols = set() + for line in stdout.splitlines(): + (sym, symtype) = line.split()[:2] + if symtype not in "UFWw": + if IS_DARWIN: + # On macOS, all symbols have a leading underscore + symbols.add(sym.lstrip('_')) + else: + # Ignore symbols such as _edata (present in all shared objects) + if not sym.startswith('_'): symbols.add(sym) + + return symbols + + # This function emulates the way distutils combines settings from sysconfig, # environment variables, and the extension being built. 
It returns a dictionary # representing the usual set of variables, suitable for writing to a generated @@ -142,10 +164,10 @@ def set_compiler_envvars(): tmp_vars = [] for var in ['CC', 'CFLAGS', 'LDFLAGS']: if var in os.environ: - print ("# pysam: (env) {}={}".format(var, os.environ[var])) + print("# pysam: (env) {}={}".format(var, os.environ[var])) elif var in sysconfig.get_config_vars(): value = sysconfig.get_config_var(var) - print ("# pysam: (sysconfig) {}={}".format(var, value)) + print("# pysam: (sysconfig) {}={}".format(var, value)) os.environ[var] = value tmp_vars += [var] @@ -209,7 +231,40 @@ class cythonize_sdist(sdist): def run(self): from Cython.Build import cythonize cythonize(self.distribution.ext_modules) - super().run() + sdist.run(self) + + +# Override build command to add extra build steps. +class extra_build(build): + def check_ext_symbol_conflicts(self): + """Checks for symbols defined in multiple extension modules, + which can lead to crashes due to incorrect functions being invoked. + Avoid by adding an appropriate #define to import/pysam.h or in + unusual cases adding another rewrite rule to devtools/import.py. 
+ """ + build_ext_obj = self.distribution.get_command_obj('build_ext') + + symbols = dict() + for ext in self.distribution.ext_modules: + for sym in run_nm_defined_symbols(build_ext_obj.get_ext_fullpath(ext.name)): + symbols.setdefault(sym, []).append(ext.name.lstrip('pysam.')) + + errors = 0 + for (sym, objs) in symbols.items(): + if (len(objs) > 1): + log.error("conflicting symbol (%s): %s", " ".join(objs), sym) + errors += 1 + + if errors > 0: raise LinkError("symbols defined in multiple extensions") + + def run(self): + build.run(self) + try: + self.check_ext_symbol_conflicts() + except OSError as e: + log.warn("skipping symbol collision check (invoking nm failed: %s)", e) + except subprocess.CalledProcessError: + log.warn("skipping symbol collision check (invoking nm failed)") class clean_ext(Command): @@ -272,10 +327,10 @@ config_headers = ["samtools/config.h", # the .pyx files. If no cython is available, the C-files included in the # distribution will be used. if HAVE_CYTHON: - print ("# pysam: cython is available - using cythonize if necessary") + print("# pysam: cython is available - using cythonize if necessary") source_pattern = "pysam/libc%s.pyx" else: - print ("# pysam: no cython available - using pre-compiled C") + print("# pysam: no cython available - using pre-compiled C") source_pattern = "pysam/libc%s.c" # Exit if there are no pre-compiled files and no cython available @@ -287,8 +342,8 @@ if not os.path.exists(fn): "from the repository" .format(fn)) -print ("# pysam: htslib mode is {}".format(HTSLIB_MODE)) -print ("# pysam: HTSLIB_CONFIGURE_OPTIONS={}".format( +print("# pysam: htslib mode is {}".format(HTSLIB_MODE)) +print("# pysam: HTSLIB_CONFIGURE_OPTIONS={}".format( HTSLIB_CONFIGURE_OPTIONS)) htslib_configure_options = None @@ -304,7 +359,7 @@ if HTSLIB_MODE in ['shared', 'separate']: "--disable-libcurl"]) HTSLIB_SOURCE = "builtin" - print ("# pysam: htslib configure options: {}".format( + print("# pysam: htslib configure options: {}".format( 
str(htslib_configure_options))) config_headers += ["htslib/config.h"] @@ -320,7 +375,7 @@ if HTSLIB_MODE in ['shared', 'separate']: htslib_make_options = run_make_print_config() for key, value in htslib_make_options.items(): - print ("# pysam: htslib_config {}={}".format(key, value)) + print("# pysam: htslib_config {}={}".format(key, value)) external_htslib_libraries = ['z'] if "LIBS" in htslib_make_options: @@ -385,7 +440,7 @@ with open(os.path.join("pysam", "config.py"), "w") as outf: "HAVE_LIBLZMA", "HAVE_MMAP"]: outf.write("{} = {}\n".format(key, config_values[key])) - print ("# pysam: config_option: {}={}".format(key, config_values[key])) + print("# pysam: config_option: {}={}".format(key, config_values[key])) # create empty config.h files if they have not been created automatically # or created by the user: @@ -536,9 +591,9 @@ metadata = { 'packages': package_list, 'requires': ['cython (>=0.29.12)'], 'ext_modules': [Extension(**opts) for opts in modules], - 'cmdclass': {'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist}, + 'cmdclass': {'build': extra_build, 'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist}, 'package_dir': package_dirs, - 'package_data': {'': ['*.pxd', '*.h'], }, + 'package_data': {'': ['*.pxd', '*.h', 'py.typed', '*.pyi'], }, # do not pack in order to permit linking to csamtools.so 'zip_safe': False, } diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 8fb1971..400425f 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -4,10 +4,18 @@ import unittest import json import collections import string +import struct import copy import array -from TestUtils import checkFieldEqual, make_data_files, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3 +from TestUtils import ( + checkFieldEqual, + make_data_files, + BAM_DATADIR, + get_temp_filename, + get_temp_context, + IS_PYTHON3, +) if IS_PYTHON3: @@ -21,14 +29,13 @@ def setUpModule(): class 
ReadTest(unittest.TestCase): - def build_read(self): - '''build an example read.''' + """build an example read.""" header = pysam.AlignmentHeader.from_references( - ["chr1", "chr2"], - [10000000, 10000000]) - + ["chr1", "chr2"], [10000000, 10000000] + ) + a = pysam.AlignedSegment(header) a.query_name = "read_12345" a.query_sequence = "ATGC" * 10 @@ -46,42 +53,39 @@ class ReadTest(unittest.TestCase): class TestAlignedSegment(ReadTest): - '''tests to check if aligned read can be constructed + """tests to check if aligned read can be constructed and manipulated. - ''' + """ def testEmpty(self): a = pysam.AlignedSegment() self.assertEqual(a.query_name, None) self.assertEqual(a.query_sequence, None) - self.assertEqual(pysam.qualities_to_qualitystring( - a.query_qualities), None) + self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) self.assertEqual(a.flag, 0) self.assertEqual(a.reference_id, -1) self.assertEqual(a.mapping_quality, 0) self.assertEqual(a.cigartuples, None) - self.assertEqual(a.tags, []) + self.assertEqual(a.get_tags(), []) self.assertEqual(a.next_reference_id, -1) self.assertEqual(a.next_reference_start, -1) self.assertEqual(a.template_length, 0) - + def testStrOfEmptyRead(self): a = pysam.AlignedSegment() s = str(a) - self.assertEqual( - "None\t0\t*\t0\t0\tNone\t*\t0\t0\tNone\tNone\t[]", - s) + self.assertEqual("None\t0\t*\t0\t0\tNone\t*\t0\t0\tNone\tNone\t[]", s) def testSettingTagInEmptyRead(self): - '''see issue 62''' + """see issue 62""" a = pysam.AlignedSegment() a.tags = (("NM", 1),) a.query_qualities = None - self.assertEqual(a.tags, [("NM", 1), ]) + self.assertEqual(a.tags, [("NM", 1),]) def testCompare(self): - '''check comparison functions.''' + """check comparison functions.""" a = self.build_read() b = None @@ -112,8 +116,8 @@ class TestAlignedSegment(ReadTest): self.assertNotEqual(hash(a), hash(b)) def testUpdate(self): - '''check if updating fields affects other variable length data - ''' + """check if updating 
fields affects other variable length data + """ a = self.build_read() b = self.build_read() @@ -126,7 +130,7 @@ class TestAlignedSegment(ReadTest): checkFieldEqual(self, a, b) # check cigar - b.cigartuples = ((0, 10), ) + b.cigartuples = ((0, 10),) checkFieldEqual(self, a, b, "cigartuples") b.cigartuples = ((0, 10), (2, 1), (0, 10)) checkFieldEqual(self, a, b, "cigartuples") @@ -135,55 +139,117 @@ class TestAlignedSegment(ReadTest): # check seq b.query_sequence = "ATGC" - checkFieldEqual(self, - a, b, - ("query_sequence", "query_qualities", "query_length")) + checkFieldEqual( + self, a, b, ("query_sequence", "query_qualities", "query_length") + ) b.query_sequence = "ATGC" * 3 - checkFieldEqual(self, - a, b, - ("query_sequence", "query_qualities", "query_length")) + checkFieldEqual( + self, a, b, ("query_sequence", "query_qualities", "query_length") + ) b.query_sequence = "ATGC" * 10 checkFieldEqual(self, a, b, ("query_qualities",)) # reset qual b = self.build_read() + def dual(name): + if name.endswith('is_unmapped'): return name.replace('unmapped', 'mapped') + elif name.endswith('is_mapped'): return name.replace('mapped', 'unmapped') + elif name.endswith('is_reverse'): return name.replace('reverse', 'forward') + elif name.endswith('is_forward'): return name.replace('forward', 'reverse') + else: return name + # check flags: for x in ( - "is_paired", "is_proper_pair", - "is_unmapped", "mate_is_unmapped", - "is_reverse", "mate_is_reverse", - "is_read1", "is_read2", - "is_secondary", "is_qcfail", - "is_duplicate", "is_supplementary"): + "is_paired", + "is_proper_pair", + "is_unmapped", + "mate_is_unmapped", + "is_reverse", + "mate_is_reverse", + "is_read1", + "is_read2", + "is_secondary", + "is_qcfail", + "is_duplicate", + "is_supplementary", + ): setattr(b, x, True) self.assertEqual(getattr(b, x), True) - checkFieldEqual(self, a, b, ("flag", x,)) + checkFieldEqual(self, a, b, ("flag", x, dual(x),)) + setattr(b, x, False) + self.assertEqual(getattr(b, x), False) + 
checkFieldEqual(self, a, b) + + for x in ( + "is_mapped", + "mate_is_mapped", + "is_forward", + "mate_is_forward", + ): setattr(b, x, False) self.assertEqual(getattr(b, x), False) + checkFieldEqual(self, a, b, ("flag", x, dual(x),)) + setattr(b, x, True) + self.assertEqual(getattr(b, x), True) checkFieldEqual(self, a, b) def testUpdate2(self): - '''issue 135: inplace update of sequence and quality score. + """issue 135: inplace update of sequence and quality score. This does not work as setting the sequence will erase the quality scores. - ''' + """ a = self.build_read() a.query_sequence = a.query_sequence[5:10] - self.assertEqual(pysam.qualities_to_qualitystring( - a.query_qualities), None) + self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) a = self.build_read() s = pysam.qualities_to_qualitystring(a.query_qualities) a.query_sequence = a.query_sequence[5:10] a.query_qualities = pysam.qualitystring_to_array(s[5:10]) - self.assertEqual(pysam.qualities_to_qualitystring( - a.query_qualities), s[5:10]) + self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), s[5:10]) + + def testUpdateQual(self): + """Ensure SEQ and QUAL updates leading to absent QUAL set all bytes to 0xff""" + + a = self.build_read() + with get_temp_context("absent_qual.bam") as fname: + with pysam.AlignmentFile(fname, "wb", header=a.header) as outf: + a.query_sequence = "ATGC" + outf.write(a) + + a.query_sequence = "ATGCATGCATGC" + outf.write(a) + + a.query_sequence = "ATGCATGC" + a.query_qualities = pysam.qualitystring_to_array("<<<<<<<<") + a.query_qualities = None + outf.write(a) + + with pysam.BGZFile(fname) as f: + # Skip BAM header + (l_text,) = struct.unpack("<4xL", f.read(8)) + f.read(l_text) + (n_ref,) = struct.unpack(" /dev/null | wc -l".format(fn)).read() @@ -81,6 +82,6 @@ def build_aligned_pairs_with_pysam(*args, **kwargs): with_seq = kwargs.pop("with_seq", False) with pysam.AlignmentFile(*args, **kwargs) as inf: data = 
[x.get_aligned_pairs(matches_only=matches_only, with_seq=with_seq) - for x in inf if not x.is_unmapped] + for x in inf if x.is_mapped] return data diff --git a/tests/AlignmentFileFetch_bench.py b/tests/AlignmentFileFetch_bench.py index bb8ce43..9fd8b83 100644 --- a/tests/AlignmentFileFetch_bench.py +++ b/tests/AlignmentFileFetch_bench.py @@ -1,9 +1,8 @@ """Benchmarking module for AlignmentFile functionality""" -import os import pytest -from TestUtils import BAM_DATADIR, force_str, flatten_nested_list +from TestUtils import BAM_DATADIR from AlignmentFileFetchTestUtils import * diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py index a665f43..3d9e2e4 100644 --- a/tests/AlignmentFileHeader_test.py +++ b/tests/AlignmentFileHeader_test.py @@ -7,7 +7,6 @@ and data files located there. import unittest import os -import sys import re import copy from collections import OrderedDict as odict @@ -15,11 +14,6 @@ import pysam import pysam.samtools from TestUtils import get_temp_filename, make_data_files, BAM_DATADIR -if sys.version_info.major >= 3: - from io import StringIO -else: - from StringIO import StringIO - def setUpModule(): make_data_files(BAM_DATADIR) diff --git a/tests/AlignmentFilePileup_bench.py b/tests/AlignmentFilePileup_bench.py index 24a06cb..d165e52 100644 --- a/tests/AlignmentFilePileup_bench.py +++ b/tests/AlignmentFilePileup_bench.py @@ -1,7 +1,5 @@ """Benchmarking module for AlignmentFile functionality""" -import os - -from TestUtils import BAM_DATADIR, force_str, flatten_nested_list +from TestUtils import BAM_DATADIR, flatten_nested_list from PileupTestUtils import * diff --git a/tests/PileupTestUtils.py b/tests/PileupTestUtils.py index 652bd5b..33acc9e 100644 --- a/tests/PileupTestUtils.py +++ b/tests/PileupTestUtils.py @@ -2,7 +2,8 @@ import os import subprocess import pysam -from TestUtils import BAM_DATADIR, force_str +from TestUtils import force_str + def build_pileup_with_samtoolsshell(fn): os.system("samtools mpileup 
{} 2> /dev/null | wc -l > /dev/null".format(fn)) diff --git a/tests/TestUtils.py b/tests/TestUtils.py index 97bd2ed..e5dee6c 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -1,7 +1,6 @@ import sys import os import glob -import difflib import gzip import contextlib import inspect @@ -177,8 +176,10 @@ def checkFieldEqual(cls, read1, read2, exclude=[]): ".query_qualities", ".bin", ".is_paired", ".is_proper_pair", - ".is_unmapped", ".mate_is_unmapped", - ".is_reverse", ".mate_is_reverse", + ".is_unmapped", ".is_mapped", + ".mate_is_unmapped", ".mate_is_mapped", + ".is_reverse", ".is_forward", + ".mate_is_reverse", ".mate_is_forward", ".is_read1", ".is_read2", ".is_secondary", ".is_qcfail", ".is_duplicate"): diff --git a/tests/VariantFileFetchTestUtils.py b/tests/VariantFileFetchTestUtils.py index 1aaca37..4c16b97 100644 --- a/tests/VariantFileFetchTestUtils.py +++ b/tests/VariantFileFetchTestUtils.py @@ -8,8 +8,6 @@ except ImportError: pass -from TestUtils import CBCF_DATADIR, force_str - def build_filter_from_vcf_with_samtoolsshell(fn): retval = os.popen( "bcftools filter -e \"N_ALT != 1 || QUAL < 20 || maf[0]>0.05\" {} | grep -cv ^# ".format(fn)).read() diff --git a/tests/VariantFile_bench.py b/tests/VariantFile_bench.py index d48760c..9663ea6 100644 --- a/tests/VariantFile_bench.py +++ b/tests/VariantFile_bench.py @@ -1,9 +1,7 @@ """Benchmarking module for AlignmentFile functionality""" -import os import pytest -from TestUtils import BAM_DATADIR, force_str, flatten_nested_list from VariantFileFetchTestUtils import * diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index fcc39a6..6224f0c 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -364,6 +364,23 @@ class TestParsing(unittest.TestCase): self.assertEqual(alleles, [ ('T',), ('G', 'A'), ('T', 'A'), ('A', 'G', 'T'), ('GTCT', 'G', 'GTACT')]) + def testAllelesVariantTypes(self): + fn = os.path.join(CBCF_DATADIR, self.filename) + v = pysam.VariantFile(fn) + rec = 
next(v) + + self.assertEqual(rec.alleles, ('T',)) + self.assertEqual(rec.alleles_variant_types, ("REF",)) + + rec.alleles = ("T", "C") + self.assertEqual(rec.alleles_variant_types, ("REF", "SNP")) + + rec.alts = ("TC",) + self.assertEqual(rec.alleles_variant_types, ("REF", "INDEL")) + + rec.ref = "AG" + self.assertEqual(rec.alleles_variant_types, ("REF", "MNP")) + def testQual(self): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(fn) @@ -536,8 +553,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase): vcf_in = pysam.VariantFile(fn_in) header = pysam.VariantHeader() - for sample in vcf_in.header.samples: - header.add_sample(sample) + header.add_samples(vcf_in.header.samples) for hr in vcf_in.header.records: header.add_line(str(hr)) @@ -615,10 +631,10 @@ class TestMultiThreading(unittest.TestCase): def testSingleThreadEqualsMultithreadResult(self): with pysam.VariantFile(self.filename) as inf: - header = inf.header - single = [r for r in inf] + header = inf.header + single = [r for r in inf] with pysam.VariantFile(self.filename, threads=2) as inf: - multi = [r for r in inf] + multi = [r for r in inf] for r1, r2 in zip(single, multi): assert str(r1) == str(r2) diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py index 5043d1f..b045b98 100644 --- a/tests/VariantRecord_test.py +++ b/tests/VariantRecord_test.py @@ -1,11 +1,4 @@ -import os -import glob -import sys -import unittest import pysam -import shutil -import gzip -import subprocess import pytest try: @@ -13,7 +6,7 @@ try: except ImportError: Path = None -from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context +from TestUtils import make_data_files, CBCF_DATADIR def setUpModule(): @@ -23,8 +16,7 @@ def setUpModule(): @pytest.fixture def vcf_header(): vcf_header = pysam.VariantHeader() - vcf_header.add_sample("sample1") - vcf_header.add_sample("sample2") + vcf_header.add_samples("sample1", "sample2") 
vcf_header.contigs.add("1") return vcf_header diff --git a/tests/faidx_test.py b/tests/faidx_test.py index 72520e7..8718f98 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -1,5 +1,4 @@ import pysam -import pytest import unittest import os import gzip diff --git a/tests/pysam_data/MM-chebi.sam b/tests/pysam_data/MM-chebi.sam new file mode 100644 index 0000000..62920ec --- /dev/null +++ b/tests/pysam_data/MM-chebi.sam @@ -0,0 +1,2 @@ +@CO Separate m, h and N modifications +* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,187,212,169 diff --git a/tests/pysam_data/MM-double.sam b/tests/pysam_data/MM-double.sam new file mode 100644 index 0000000..608516f --- /dev/null +++ b/tests/pysam_data/MM-double.sam @@ -0,0 +1,3 @@ +@CO Modifications called on both strands of the same record, +@CO including potentially at the same location simultaneously. +* 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0;G-m,0,2,0,4;G+o,4; Ml:B:C,128,153,179,115,141,166,192,102 diff --git a/tests/pysam_data/MM-multi.sam b/tests/pysam_data/MM-multi.sam new file mode 100644 index 0000000..b2259a0 --- /dev/null +++ b/tests/pysam_data/MM-multi.sam @@ -0,0 +1,7 @@ +@CO Testing multiple m, h and N modifications on the same read. +@CO r1 has them separated out. +@CO r2 has them combined together, for example as produced by +@CO a joint basecaller which assigns probabilities to all +@CO trained events simultaneously. 
+r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/tests/pysam_data/MM-orient.sam b/tests/pysam_data/MM-orient.sam new file mode 100644 index 0000000..363e7c2 --- /dev/null +++ b/tests/pysam_data/MM-orient.sam @@ -0,0 +1,6 @@ +@CO Testing mods on top and bottom strand, but also in +@CO original vs reverse-complemented orientation +top-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +top-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +bot-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 +bot-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py index 762002a..09cb210 100644 --- a/tests/tabixproxies_test.py +++ b/tests/tabixproxies_test.py @@ -1,7 +1,6 @@ import unittest import pysam import os -import sys import re import copy import gzip diff --git a/tests/test_samtools_python.py b/tests/test_samtools_python.py index da4d332..3a9d841 100644 --- a/tests/test_samtools_python.py +++ b/tests/test_samtools_python.py @@ -1,6 +1,5 @@ import pysam import os -import pytest from TestUtils import make_data_files, BAM_DATADIR -- 2.30.2