From 0756fadda6074d56d45d8f40268f1342947bb217 Mon Sep 17 00:00:00 2001 From: Nilesh Patra Date: Fri, 15 Oct 2021 00:58:59 +0530 Subject: [PATCH] New upstream version 0.17.0+ds --- .github/workflows/ci.yaml | 152 ++ .github/workflows/release.yaml | 115 ++ .gitignore | 3 + .travis.yml => .travis.disabled.yml | 17 + AUTHORS | 9 + INSTALL | 2 +- MANIFEST.in | 50 +- NEWS | 8 +- README.rst | 2 +- bcftools/HMM.c | 2 +- bcftools/HMM.c.pysam.c | 2 +- bcftools/HMM.h | 2 +- bcftools/LICENSE | 27 +- bcftools/README | 22 + bcftools/abuf.c | 713 +++++++ bcftools/abuf.c.pysam.c | 715 +++++++ bcftools/abuf.h | 78 + bcftools/bam2bcf.c | 397 +++- bcftools/bam2bcf.c.pysam.c | 397 +++- bcftools/bam2bcf.h | 39 +- bcftools/bam2bcf_indel.c | 1110 ++++++++--- bcftools/bam2bcf_indel.c.pysam.c | 1110 ++++++++--- bcftools/bcftools.h | 67 +- bcftools/bcftools.pysam.c | 20 + bcftools/bcftools.pysam.h | 13 + bcftools/bin.c | 1 + bcftools/bin.c.pysam.c | 1 + bcftools/call.h | 36 +- bcftools/ccall.c | 5 +- bcftools/ccall.c.pysam.c | 5 +- bcftools/consensus.c | 512 +++-- bcftools/consensus.c.pysam.c | 514 +++-- bcftools/convert.c | 63 +- bcftools/convert.c.pysam.c | 69 +- bcftools/csq.c | 203 +- bcftools/csq.c.pysam.c | 203 +- bcftools/dist.c | 124 ++ bcftools/dist.c.pysam.c | 126 ++ bcftools/dist.h | 98 + bcftools/em.c | 2 +- bcftools/em.c.pysam.c | 2 +- bcftools/extsort.c | 250 +++ bcftools/extsort.c.pysam.c | 252 +++ bcftools/extsort.h | 56 + bcftools/filter.c | 747 +++++-- bcftools/filter.c.pysam.c | 749 +++++-- bcftools/filter.h | 14 +- bcftools/hclust.c | 1 + bcftools/hclust.c.pysam.c | 1 + bcftools/htslib-1.10.2/LICENSE | 69 - bcftools/htslib-1.10.2/README | 5 - bcftools/main.c | 8 +- bcftools/main.c.pysam.c | 8 +- bcftools/mcall.c | 764 ++++--- bcftools/mcall.c.pysam.c | 764 ++++--- bcftools/mpileup.c | 536 ++++- bcftools/mpileup.c.pysam.c | 566 ++++-- bcftools/ploidy.h | 2 +- bcftools/prob1.c | 2 +- bcftools/prob1.c.pysam.c | 2 +- bcftools/prob1.h | 2 +- bcftools/rbuf.h | 2 +- 
bcftools/regidx.c | 2 +- bcftools/regidx.c.pysam.c | 2 +- bcftools/regidx.h | 2 +- bcftools/reheader.c | 100 +- bcftools/reheader.c.pysam.c | 102 +- bcftools/smpl_ilist.c | 2 +- bcftools/smpl_ilist.c.pysam.c | 2 +- bcftools/str_finder.c | 270 +++ bcftools/str_finder.c.pysam.c | 272 +++ bcftools/str_finder.h | 64 + bcftools/utlist.h | 761 +++++++ bcftools/vcfannotate.c | 795 ++++++-- bcftools/vcfannotate.c.pysam.c | 797 ++++++-- bcftools/vcfbuf.c | 214 +- bcftools/vcfbuf.c.pysam.c | 214 +- bcftools/vcfbuf.h | 44 +- bcftools/vcfcall.c | 106 +- bcftools/vcfcall.c.pysam.c | 114 +- bcftools/vcfcnv.c | 1 + bcftools/vcfcnv.c.pysam.c | 3 +- bcftools/vcfconcat.c | 13 +- bcftools/vcfconcat.c.pysam.c | 15 +- bcftools/vcfconvert.c | 42 +- bcftools/vcfconvert.c.pysam.c | 44 +- bcftools/vcffilter.c | 66 +- bcftools/vcffilter.c.pysam.c | 68 +- bcftools/vcfgtcheck.c | 1613 +++++++++------ bcftools/vcfgtcheck.c.pysam.c | 1615 +++++++++------ bcftools/vcfindex.c | 155 +- bcftools/vcfindex.c.pysam.c | 157 +- bcftools/vcfisec.c | 11 +- bcftools/vcfisec.c.pysam.c | 13 +- bcftools/vcfmerge.c | 638 +++++- bcftools/vcfmerge.c.pysam.c | 644 +++++- bcftools/vcfnorm.c | 479 +++-- bcftools/vcfnorm.c.pysam.c | 481 +++-- bcftools/vcfplugin.c | 93 +- bcftools/vcfplugin.c.pysam.c | 95 +- bcftools/vcfquery.c | 13 +- bcftools/vcfquery.c.pysam.c | 15 +- bcftools/vcfroh.c | 54 +- bcftools/vcfroh.c.pysam.c | 62 +- bcftools/vcfsom.c | 16 +- bcftools/vcfsom.c.pysam.c | 18 +- bcftools/vcfsort.c | 42 +- bcftools/vcfsort.c.pysam.c | 46 +- bcftools/vcfstats.c | 177 +- bcftools/vcfstats.c.pysam.c | 185 +- bcftools/vcfview.c | 20 +- bcftools/vcfview.c.pysam.c | 30 +- bcftools/vcmp.c | 2 +- bcftools/vcmp.c.pysam.c | 2 +- bcftools/vcmp.h | 2 +- bcftools/version.c | 14 +- bcftools/version.c.pysam.c | 18 +- bcftools/version.sh | 25 +- cy_build.py | 6 +- devtools/import.py | 29 +- devtools/install-CGAT-tools.sh | 5 +- devtools/run_tests_travis.sh | 7 +- doc/api.rst | 30 +- doc/benchmarking.rst | 2 + doc/conf.py | 
16 +- doc/developer.rst | 13 +- doc/faq.rst | 13 +- doc/glossary.rst | 13 + doc/index.rst | 21 +- doc/installation.rst | 2 +- doc/release.rst | 70 +- doc/usage.rst | 3 +- import/pysam.c | 20 + import/pysam.h | 13 + pysam.py | 1 - pysam/__init__.py | 4 +- pysam/libcalignedsegment.pxd | 6 +- pysam/libcalignedsegment.pyx | 64 +- pysam/libcalignmentfile.pxd | 20 +- pysam/libcalignmentfile.pyx | 71 +- pysam/libcbcf.pyx | 39 +- pysam/libcbcftools.pxd | 2 +- pysam/libcfaidx.pyx | 4 +- pysam/libchtslib.pxd | 23 +- pysam/libchtslib.pyx | 6 +- pysam/libcsamtools.pxd | 2 +- pysam/libctabix.pyx | 60 +- pysam/libcutils.pxd | 16 +- pysam/libcutils.pyx | 70 +- pysam/samtools.py | 4 + pysam/version.h | 6 +- pysam/version.py | 8 +- samtools/LICENSE | 2 +- samtools/README | 35 +- samtools/amplicon_stats.c | 1754 ++++++++++++++++ samtools/amplicon_stats.c.pysam.c | 1756 +++++++++++++++++ samtools/bam.c | 20 +- samtools/bam.c.pysam.c | 20 +- samtools/bam.h | 4 +- samtools/bam2bcf_indel.c | 4 + samtools/bam2bcf_indel.c.pysam.c | 4 + samtools/bam2depth.c | 1160 ++++++++--- samtools/bam2depth.c.pysam.c | 1160 ++++++++--- samtools/bam_addrprg.c | 42 +- samtools/bam_addrprg.c.pysam.c | 42 +- samtools/bam_ampliconclip.c | 1079 ++++++++++ samtools/bam_ampliconclip.c.pysam.c | 1081 ++++++++++ samtools/bam_ampliconclip.h | 54 + samtools/bam_aux.c | 6 +- samtools/bam_aux.c.pysam.c | 6 +- samtools/bam_cat.c | 24 +- samtools/bam_cat.c.pysam.c | 24 +- samtools/bam_color.c | 24 +- samtools/bam_color.c.pysam.c | 24 +- samtools/bam_fastq.c | 950 ++++----- samtools/bam_fastq.c.pysam.c | 950 ++++----- samtools/bam_flags.c | 66 +- samtools/bam_flags.c.pysam.c | 66 +- samtools/bam_import.c | 487 +++++ samtools/bam_import.c.pysam.c | 489 +++++ samtools/bam_index.c.pysam.c | 2 +- samtools/bam_markdup.c | 675 +++++-- samtools/bam_markdup.c.pysam.c | 675 +++++-- samtools/bam_mate.c | 12 +- samtools/bam_mate.c.pysam.c | 12 +- samtools/bam_md.c | 185 +- samtools/bam_md.c.pysam.c | 185 +- samtools/bam_plcmd.c | 
66 +- samtools/bam_plcmd.c.pysam.c | 94 +- samtools/bam_reheader.c.pysam.c | 4 +- samtools/bam_rmdupse.c.pysam.c | 6 +- samtools/bam_sort.c | 533 ++++- samtools/bam_sort.c.pysam.c | 533 ++++- samtools/bam_stat.c | 113 +- samtools/bam_stat.c.pysam.c | 115 +- samtools/bamtk.c | 94 +- samtools/bamtk.c.pysam.c | 99 +- samtools/bedcov.c | 103 +- samtools/bedcov.c.pysam.c | 103 +- samtools/bedidx.c | 8 + samtools/bedidx.c.pysam.c | 8 + samtools/coverage.c | 241 ++- samtools/coverage.c.pysam.c | 241 ++- samtools/cut_target.c | 12 +- samtools/cut_target.c.pysam.c | 12 +- samtools/dict.c | 28 +- samtools/dict.c.pysam.c | 34 +- samtools/faidx.c | 53 +- samtools/faidx.c.pysam.c | 53 +- samtools/htslib-1.10/LICENSE | 69 - samtools/htslib-1.10/README | 5 - samtools/padding.c | 61 +- samtools/padding.c.pysam.c | 61 +- samtools/phase.c | 9 +- samtools/phase.c.pysam.c | 9 +- samtools/sam_view.c | 452 +++-- samtools/sam_view.c.pysam.c | 452 +++-- samtools/samtools.pysam.c | 20 + samtools/samtools.pysam.h | 13 + samtools/stats.c | 82 +- samtools/stats.c.pysam.c | 84 +- samtools/stats_isize.c.pysam.c | 2 +- samtools/tmp_file.h | 2 +- samtools/version.sh | 2 +- setup.py | 130 +- tests/AlignedSegment_test.py | 13 +- tests/AlignmentFileHeader_test.py | 6 +- tests/AlignmentFilePileup_test.py | 6 +- tests/AlignmentFile_test.py | 53 +- tests/StreamFiledescriptors_test.py | 6 +- tests/TestUtils.py | 13 + tests/VariantFile_test.py | 21 +- tests/VariantRecord_test.py | 6 +- tests/cbcf_data/Makefile | 8 +- tests/compile_test.py | 8 +- tests/faidx_test.py | 6 +- ...ader.bam => 0example_no_seq_in_header.bam} | Bin ... 
0example_no_seq_in_header_null_bytes.bam} | Bin tests/pysam_data/Makefile | 38 +- tests/pysam_data/ex1.sam.gz | Bin 113194 -> 109698 bytes tests/refactoring.txt | 2 +- tests/samtools_test.py | 18 +- tests/tabix_data/Makefile | 7 + tests/tabix_data/example.bed.gz.tbi | Bin 192 -> 190 bytes tests/tabix_data/example.gff3.gz.tbi | Bin 1457 -> 1454 bytes tests/tabix_data/example.gtf.gz.tbi | Bin 196 -> 196 bytes tests/tabix_data/example.vcf.gz.tbi | Bin 182 -> 180 bytes .../tabix_data/example_badcomments.bed.gz.tbi | Bin 194 -> 194 bytes .../tabix_data/example_badcomments.gtf.gz.tbi | Bin 198 -> 198 bytes .../tabix_data/example_badcomments.vcf.gz.tbi | Bin 186 -> 184 bytes tests/tabix_data/example_comments.bed.gz.tbi | Bin 194 -> 194 bytes tests/tabix_data/example_comments.gtf.gz.tbi | Bin 198 -> 198 bytes tests/tabix_data/example_comments.vcf.gz.tbi | Bin 186 -> 184 bytes tests/tabix_test.py | 8 +- tests/tabixproxies_test.py | 6 +- tests/test_samtools_python.py | 6 +- 256 files changed, 31299 insertions(+), 9372 deletions(-) create mode 100644 .github/workflows/ci.yaml create mode 100644 .github/workflows/release.yaml rename .travis.yml => .travis.disabled.yml (81%) create mode 100644 bcftools/abuf.c create mode 100644 bcftools/abuf.c.pysam.c create mode 100644 bcftools/abuf.h create mode 100644 bcftools/dist.c create mode 100644 bcftools/dist.c.pysam.c create mode 100644 bcftools/dist.h create mode 100644 bcftools/extsort.c create mode 100644 bcftools/extsort.c.pysam.c create mode 100644 bcftools/extsort.h delete mode 100644 bcftools/htslib-1.10.2/LICENSE delete mode 100644 bcftools/htslib-1.10.2/README create mode 100644 bcftools/str_finder.c create mode 100644 bcftools/str_finder.c.pysam.c create mode 100644 bcftools/str_finder.h create mode 100644 bcftools/utlist.h delete mode 100644 pysam.py create mode 100644 samtools/amplicon_stats.c create mode 100644 samtools/amplicon_stats.c.pysam.c create mode 100644 samtools/bam_ampliconclip.c create mode 100644 
samtools/bam_ampliconclip.c.pysam.c create mode 100644 samtools/bam_ampliconclip.h create mode 100644 samtools/bam_import.c create mode 100644 samtools/bam_import.c.pysam.c delete mode 100644 samtools/htslib-1.10/LICENSE delete mode 100644 samtools/htslib-1.10/README rename tests/pysam_data/{example_no_seq_in_header.bam => 0example_no_seq_in_header.bam} (100%) rename tests/pysam_data/{example_no_seq_in_header_null_bytes.bam => 0example_no_seq_in_header_null_bytes.bam} (100%) create mode 100644 tests/tabix_data/Makefile diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..4075f1c --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,152 @@ +name: CI + +# on: [push, pull_request] +on: [pull_request] + +jobs: + direct: + runs-on: ${{ matrix.os }}-latest + strategy: + matrix: + os: [ubuntu, macos] + python-version: [2.7, 3.6, 3.7, 3.8, 3.9] + exclude: + # Run only the latest 2.x and 3.x on macOS + - os: macos + python-version: 3.6 + - os: macos + python-version: 3.7 + - os: macos + python-version: 3.8 + + steps: + - name: Checkout pysam + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install prerequisite Python libraries + run: pip install cython pytest pytest-pep8 + + - name: Install build prerequisites + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev + + - name: Build (directly from checkout) + run: python setup.py build + + - name: Install test prerequisites + run: | + case $RUNNER_OS in + Linux) + sudo apt-get install -q --no-install-recommends --no-install-suggests samtools bcftools tabix + ;; + macOS) + brew install -q samtools bcftools + ;; + esac + + - name: Run tests + run: | + export PYTHONPATH=$(echo $GITHUB_WORKSPACE/build/lib.*) + export REF_PATH=':' + pytest + + + sdist: 
+ runs-on: ${{ matrix.os }}-latest + strategy: + matrix: + os: [ubuntu, macos] + python-version: [3.9] + + steps: + - name: Checkout pysam + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install prerequisite Python libraries + run: pip install cython pytest pytest-pep8 + + - name: Install build prerequisites + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev + + - name: Create source distribution + run: python setup.py sdist --owner=root --group=root + + - name: Build (via sdist tarball) + run: pip install --verbose --no-deps --no-binary=':all:' pysam-*.tar.gz + working-directory: dist + + - name: Install test prerequisites + run: | + case $RUNNER_OS in + Linux) + sudo apt-get install -q --no-install-recommends --no-install-suggests samtools bcftools tabix + ;; + macOS) + brew install -q samtools bcftools + ;; + esac + + - name: Run tests + run: REF_PATH=':' pytest + + - name: Upload sdist tarball + if: runner.os == 'Linux' + uses: actions/upload-artifact@v2 + with: + name: sdist + path: dist/pysam-*.tar.gz + retention-days: 14 + + + conda: + timeout-minutes: 20 + runs-on: ${{ matrix.os }}-latest + strategy: + matrix: + os: [ubuntu] + python-version: [3.7] + defaults: + run: + shell: bash -l {0} # needed for conda activation + env: + HTSLIB_CONFIGURE_OPTIONS: "--disable-libcurl" + + steps: + - name: Checkout pysam + uses: actions/checkout@v2 + + - uses: conda-incubator/setup-miniconda@v2 + with: + channel-priority: strict + activate-environment: testenv + auto-activate-base: false + use-only-tar-bz2: true + + - name: Set up Conda and Python ${{ matrix.python-version }} + run: | + conda config --add channels bioconda --add channels conda-forge + conda install python=${{ matrix.python-version }} cython + + - name: Build (directly from 
checkout) + run: python setup.py install + + - name: Install test prerequisites via Conda + run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest + + - name: Run tests + run: REF_PATH=':' pytest diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..bbc954f --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,115 @@ +name: Publish pysam wheels to PyPI and TestPyPI + +on: + push: + branches: + - v[0-9]+.[0-9]+.x + tags: + - v* + release: + types: + - published + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-20.04, macos-10.15] # windows-2019, + + steps: + - name: Checkout pysam + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Install prerequisite Python libraries + run: | + python -m pip install --upgrade pip + pip install cython pytest pytest-pep8 + + - name: Build wheels for linux + if: runner.os == 'Linux' + uses: pypa/cibuildwheel@v2.1.2 + env: + CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* + CIBW_BEFORE_BUILD: yum install -y libcurl-devel zlib-devel bzip2-devel xz-devel && pip install cython + CIBW_MANYLINUX_X86_64_IMAGE: manylinux1 + CIBW_MANYLINUX_I686_IMAGE: manylinux1 + + - name: Build wheels for macos + if: runner.os != 'Linux' + uses: pypa/cibuildwheel@v2.1.2 + env: + CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* + CIBW_BEFORE_BUILD: pip install cython + + - name: Upload artifacts + uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl + + build_sdist: + + runs-on: ${{ matrix.os }}-latest + strategy: + matrix: + os: [ubuntu, macos] + python-version: [3.9] + + steps: + - name: Checkout pysam + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install 
prerequisite Python libraries + run: pip install cython pytest pytest-pep8 + + - name: Install build prerequisites + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev + + - name: Create source distribution + run: python setup.py sdist + + - uses: actions/upload-artifact@v2 + with: + path: dist/*.tar.gz + + upload_pypi: + + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + + steps: + - name: Get Artifacts + uses: actions/download-artifact@v2 + with: + name: artifact + path: dist + + - name: Publish distribution to Test PyPI + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + + - name: Publish distribution to PyPI + if: github.event_name == 'release' && github.event.action == 'published' + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + diff --git a/.gitignore b/.gitignore index b07a532..6ec2d26 100644 --- a/.gitignore +++ b/.gitignore @@ -17,11 +17,14 @@ tests/cbcf_data tests/tabix_data samtools/config.h +samtools/samtools_config_vars.h bcftools/config.h htslib/config.status htslib/config.h htslib/config.log htslib/config.mk +htslib/config_vars.h +htslib/htscodecs.mk htslib/htslib.pc.tmp htslib/htslib-uninstalled.pc pysam/config.py diff --git a/.travis.yml b/.travis.disabled.yml similarity index 81% rename from .travis.yml rename to .travis.disabled.yml index 47ce194..5b7bcc8 100644 --- a/.travis.yml +++ b/.travis.disabled.yml @@ -39,6 +39,16 @@ _cibw_linux: &cibw_linux - docker <<: *cibw_common +_cibw_linux_aarch64: &cibw_linux_aarch64 + stage: deploy + os: linux + arch: arm64 + language: python + python: '3.9' + services: + - docker + <<: *cibw_common + matrix: include: - stage: deploy @@ -74,6 +84,13 
@@ matrix: - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}' - CIBW_TEST_COMMAND='python -c "import pysam"' + - <<: *cibw_linux_aarch64 + env: + - CIBW_BUILD="*_aarch64" + - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt" + - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' + - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}' + - CIBW_TEST_COMMAND='python -c "import pysam"' - stage: deploy os: osx language: generic diff --git a/AUTHORS b/AUTHORS index 4b00536..4e9c5eb 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,8 +1,17 @@ +Many people have contributed to pysam. The list of github contributors +is the best place to get a full list of authors and their contributions. +The list and summary below is out-of-date and represents the earlier +stages of the project. + List of contributors: Andreas Heger, Tildon Grant Belgard, Florian Finkernagel, Leo Goodstadt, Martin Goodson all contributed code to pysam. +John Marshall has been looking after pysam and its community for +several years, as well as making many code contributions and improving +the engineering of pysam. + Kevin B. Jacobs implemented a Cython wrapper for the VCF/BCF reader/writer in htslib. diff --git a/INSTALL b/INSTALL index 9636125..5016dcc 100644 --- a/INSTALL +++ b/INSTALL @@ -47,7 +47,7 @@ features. If these fail, for example due to missing library dependencies (`libcurl`, `libcrypto`), it will fall back to conservative defaults. -Options can be passed to the configure script explicitely by +Options can be passed to the configure script explicitly by setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`. 
For example:: diff --git a/MANIFEST.in b/MANIFEST.in index aaacb22..25e9a1a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -16,45 +16,39 @@ include pysam/libc*.pyx include pysam/libc*.c include pysam/*.c include pysam/*.h +exclude pysam/config.py + +include win32/*.[ch] # exclude tests from pypi tar-ball - they # require additional data prune tests/ # samtools -include samtools/configure -include samtools/config.mk.in -include samtools/config.h.in -include samtools/*.h -include samtools/*.c -exclude samtools/config.h -include samtools/*/*.h +include samtools/LICENSE samtools/README samtools/lz4/LICENSE +recursive-include samtools *.[ch] +include samtools/version.sh +exclude samtools/*config*.h # bcftools -include bcftools/*.h -include bcftools/*.c -exclude bcftools/config.h +include bcftools/LICENSE bcftools/README +include bcftools/*.[ch] +include bcftools/version.sh +exclude bcftools/*config*.h # htslib -include htslib/*.c -include htslib/*.h -include htslib/INSTALL -include htslib/NEWS -exclude htslib/config.h -include htslib/Makefile -include htslib/htslib_vars.mk -include htslib/configure -include htslib/config.mk.in -include htslib/config.h.in -include htslib/htslib.pc.in -include htslib/htslib/*.h -include htslib/cram/*.c -include htslib/cram/*.h -include htslib/os/*.c -include htslib/os/*.h +include htslib/LICENSE htslib/README +recursive-include htslib *.[ch] +exclude htslib/*config*.h + +include htslib/configure.ac htslib/m4/*.m4 htslib/*.in +include htslib/configure htslib/version.sh +include htslib/Makefile htslib/*.mk +exclude htslib/config.mk htslib/htscodecs.mk + include cy_build.py -include pysam.py include requirements.txt # documentation -include doc/* +include doc/*.py doc/*.rst +include doc/Makefile doc/make.bat diff --git a/NEWS b/NEWS index 49ce485..75d9249 100644 --- a/NEWS +++ b/NEWS @@ -209,7 +209,7 @@ Release 0.11.2 ============== This release wraps htslib/samtools/bcfools versions 1.4.1 in response -to a security fix in these libraries. 
Additionaly the following +to a security fix in these libraries. Additionally the following issues have been fixed: * [#452] add GFF3 support for tabix parsers @@ -330,7 +330,7 @@ Overview -------- The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other -enchancements and bugfixes. See below for a detailed list. +enhancements and bugfixes. See below for a detailed list. `Htslib 1.3 `_ comes with additional capabilities for remote file access which depend @@ -373,7 +373,7 @@ Detailed release notes and code bloat. * run configure for the builtin htslib library in order to detect optional libraries such as libcurl. Configure behaviour can be - controlled by setting the environmet variable + controlled by setting the environment variable HTSLIB_CONFIGURE_OPTIONS. * get_reference_sequence() now returns the reference sequence and not something looking like it. This bug had effects on @@ -576,7 +576,7 @@ Other changes: Backwards incompatible changes -* Empty cigarstring now returns None (intstead of '') +* Empty cigarstring now returns None (instead of '') * Empty cigar now returns None (instead of []) * When using the extension classes in cython modules, AlignedRead needs to be substituted with AlignedSegment. diff --git a/README.rst b/README.rst index 4efa827..368984a 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.10.2, samtools-1.10, and bcftools-1.10.2. +The current version of pysam wraps 3rd-party code from htslib-1.13, samtools-1.13, and bcftools-1.13. Pysam is available through `pypi `_. To install, type:: diff --git a/bcftools/HMM.c b/bcftools/HMM.c index 70ad8d6..c2d302f 100644 --- a/bcftools/HMM.c +++ b/bcftools/HMM.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2015 Genome Research Ltd. 
+ Copyright (c) 2014-2017 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c index 2280c0d..d039367 100644 --- a/bcftools/HMM.c.pysam.c +++ b/bcftools/HMM.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2015 Genome Research Ltd. + Copyright (c) 2014-2017 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/HMM.h b/bcftools/HMM.h index 70c9cb8..3a6cab3 100644 --- a/bcftools/HMM.h +++ b/bcftools/HMM.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2015 Genome Research Ltd. + Copyright (c) 2014-2016 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/LICENSE b/bcftools/LICENSE index 75aeb6c..f223b09 100644 --- a/bcftools/LICENSE +++ b/bcftools/LICENSE @@ -9,7 +9,7 @@ the INSTALL document), the use of this software is governed by the GPL license. The MIT/Expat License -Copyright (C) 2012-2014 Genome Research Ltd. +Copyright (C) 2012-2021 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -746,3 +746,28 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +----------------------------------------------------------------------------- + +LICENSE for utlist.h + +Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/bcftools/README b/bcftools/README index 5cb1bbd..fff0cb7 100644 --- a/bcftools/README +++ b/bcftools/README @@ -3,3 +3,25 @@ SAMtools) and manipulating VCF and BCF files. The program is intended to replace the Perl-based tools from vcftools. See INSTALL for building and installation instructions. + +Please cite this paper when using BCFtools for your publications: + +Twelve years of SAMtools and BCFtools +Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li +GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 + +@article{10.1093/gigascience/giab008, + author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng}, + title = "{Twelve years of SAMtools and BCFtools}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. 
They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods.The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines.Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed \\>1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab008}, + url = {https://doi.org/10.1093/gigascience/giab008}, + note = {giab008}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf}, +} diff --git a/bcftools/abuf.c b/bcftools/abuf.c new file mode 100644 index 0000000..5e45e9e --- /dev/null +++ b/bcftools/abuf.c @@ -0,0 +1,713 @@ +/* The MIT License + + Copyright (c) 2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include +#include "bcftools.h" +#include "abuf.h" +#include "rbuf.h" + +typedef enum +{ + M_FIRST, M_SUM +} +merge_rule_t; + +typedef struct +{ + kstring_t ref, alt; + int ial; // the index of the original ALT allele, 1-based + int beg, end; // 0-based inclusive offsets to ref,alt +} +atom_t; + +typedef struct +{ + bcf1_t *rec; + int nori, nout; // number of ALTs in the input, and VCF rows on output + uint8_t *tbl; // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below + uint8_t *overlaps; // is the star allele needed for this variant? 
+ atom_t **atoms; + int matoms, mtbl, moverlaps; + char *info_tag; +} +split_t; + +struct _abuf_t +{ + abuf_opt_t mode; + split_t split; + atom_t *atoms; + int natoms, matoms; + const bcf_hdr_t *hdr; + bcf_hdr_t *out_hdr; + bcf1_t **vcf; // dimensions stored in rbuf + rbuf_t rbuf; + + kstring_t tmps; + void *tmp, *tmp2; + int32_t *gt, *tmpi; + int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2; + int star_allele; +}; + +abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode) +{ + if ( mode!=SPLIT ) error("todo\n"); + abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t)); + buf->hdr = hdr; + buf->out_hdr = (bcf_hdr_t*) hdr; + buf->mode = mode; + buf->star_allele = 1; + rbuf_init(&buf->rbuf, 0); + return buf; +} + +void abuf_destroy(abuf_t *buf) +{ + int i; + for (i=0; imatoms; i++) + { + free(buf->atoms[i].ref.s); + free(buf->atoms[i].alt.s); + } + free(buf->atoms); + free(buf->split.atoms); + free(buf->split.overlaps); + free(buf->split.tbl); + for (i=0; irbuf.m; i++) + if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]); + free(buf->vcf); + free(buf->gt); + free(buf->tmpi); + free(buf->tmp); + free(buf->tmp2); + free(buf->tmps.s); + free(buf); +} + +void abuf_set(abuf_t *buf, abuf_opt_t key, void *value) +{ + if ( key==BCF_HDR ) { buf->out_hdr = *((bcf_hdr_t**)value); return; } + if ( key==INFO_TAG ) + { + buf->split.info_tag = *((char**)value); + bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); + return; + } + if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; } +} + +/* + Split alleles into primitives, e.g. + CC>TT becomes C>T,C>T + GCGT>GTGA becomes C>T,T>A + + There is no sequence alignment, just trimming and greedy matching + from the left side. 
+*/ +static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) +{ + // Trim identical sequence from right + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[ial]; + int rlen = strlen(ref); + int alen = strlen(alt); + while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--; + int Mlen = rlen > alen ? rlen : alen; + + atom_t *atom = NULL; + int i; + for (i=0; ialt); + if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; } + } + else + { + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + } + continue; + } + if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion? + { + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + } + } +} +static int _atoms_inconsistent(const atom_t *a, const atom_t *b) +{ + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + int rcmp = strcasecmp(a->ref.s,b->ref.s); + if ( rcmp ) return rcmp; + return strcasecmp(a->alt.s,b->alt.s); +} +/* + For reproducibility of tests on different platforms, we need to guarantee the same order of identical + atoms originating from different source ALTs. Even though they are consistent, different values can be + picked for VCF annotations as currently the values from the one that comes first are used. 
+*/ +static int _cmp_atoms(const void *aptr, const void *bptr) +{ + const atom_t *a = (const atom_t*) aptr; + const atom_t *b = (const atom_t*) bptr; + int rcmp = _atoms_inconsistent(a,b); + if ( rcmp ) return rcmp; + if ( a->ial < b->ial ) return -1; + if ( a->ial > b->ial ) return 1; + return 0; +} +static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms) +{ + buf->split.rec = rec; + buf->split.nori = rec->n_allele - 1; + buf->split.nout = 0; + hts_expand(uint8_t,buf->split.nori*natoms,buf->split.mtbl,buf->split.tbl); + hts_expand(atom_t*,natoms,buf->split.matoms,buf->split.atoms); + hts_expand(uint8_t,natoms,buf->split.moverlaps,buf->split.overlaps); + memset(buf->split.overlaps,0,sizeof(*buf->split.overlaps)*natoms); +} +static void _split_table_new(abuf_t *buf, atom_t *atom) +{ + int i, iout = buf->split.nout++; + buf->split.atoms[iout] = atom; + uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; + for (i=0; isplit.nori; i++) ptr[i] = 0; + ptr[atom->ial-1] = 1; +} +static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom) +{ + uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; + ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 
2 : 1; + buf->split.overlaps[iout] = 1; +} +#if 0 +static void _split_table_print(abuf_t *buf) +{ + int i,j; + for (i=0; isplit.nout; i++) + { + atom_t *atom = buf->split.atoms[i]; + uint8_t *ptr = buf->split.tbl + i*buf->split.nori; + fprintf(stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s); + for (j=0; jsplit.nori; j++) fprintf(stderr,"\t%d",(int)ptr[j]); + fprintf(stderr,"\n"); + } +} +static void _split_table_print_atoms(abuf_t *buf) +{ + int i; + for (i=0; inatoms; i++) + { + atom_t *atom = &buf->atoms[i]; + fprintf(stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end); + } +} +#endif +static inline uint8_t _has_star_allele(abuf_t *buf, int iout) +{ + if ( !buf->star_allele ) return 0; + return buf->split.overlaps[iout]; +} +static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial) +{ + if ( !ial ) return ial; + return buf->split.tbl[irow*buf->split.nori + ial - 1]; +} +static void _split_table_set_chrom_qual(abuf_t *buf) +{ + int iout,j; + bcf1_t *rec = buf->split.rec; + for (iout=0; ioutsplit.nout; iout++) + { + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + j = rbuf_append(&buf->rbuf); + if ( !buf->vcf[j] ) buf->vcf[j] = bcf_init1(); + bcf1_t *out = buf->vcf[j]; + bcf_clear1(out); + + atom_t *atom = buf->split.atoms[iout]; + out->rid = rec->rid; + out->pos = rec->pos + atom->beg; + bcf_update_id(buf->out_hdr, out, rec->d.id); + + const char *als[3]; + als[0] = atom->ref.s; + als[1] = atom->alt.s; + als[2] = "*"; + int nals = _has_star_allele(buf,iout) ? 
3 : 2; + bcf_update_alleles(buf->out_hdr, out, als, nals); + + if ( bcf_float_is_missing(rec->qual) ) + bcf_float_set_missing(out->qual); + else + out->qual = rec->qual; + + bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt); + } +} +static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode) +{ + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key); + int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key); + int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key); + if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags + if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_LONG ) return; // todo: 64bit integers + + bcf1_t *rec = buf->split.rec; + int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp; + int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4; + + // Check for incorrect number of values. Note this check does not consider all values missing + // and will remove annotations that don't pass. + if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return; + + if ( buf->mtmp2 < buf->mtmp ) + { + buf->tmp2 = realloc(buf->tmp2, buf->mtmp); + if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp); + buf->mtmp2 = buf->mtmp; + } + + int32_t missing = bcf_int32_missing; + void *missing_ptr = (void*)&missing; + if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + + int iout,i; + for (iout=0; ioutsplit.nout; iout++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int star_allele = _has_star_allele(buf,iout); + int ret = 0; + if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) + ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? 
NULL : buf->tmp, nval, type); + else if ( len==BCF_VL_A ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp2,buf->tmp+4*iori,4); + if ( star_allele ) + memcpy(buf->tmp2+4,missing_ptr,4); + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type); + } + else if ( len==BCF_VL_R ) + { + memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records + int iori = buf->split.atoms[iout]->ial; + assert( iorisplit.nori ); + memcpy(buf->tmp2+4,buf->tmp+4*iori,4); + if ( type==BCF_HT_INT && mode==M_SUM ) + { + uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; + for (i=iori; isplit.nori; i++) + { + if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1]; + } + } + if ( star_allele ) + memcpy(buf->tmp2+8,missing_ptr,4); + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type); + } + if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag); + } +} +static void _split_table_set_history(abuf_t *buf) +{ + int i,j; + bcf1_t *rec = buf->split.rec; + buf->tmps.l = 0; + ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]); + for (i=1; in_allele; i++) + { + kputs(rec->d.allele[i],&buf->tmps); + if ( i+1n_allele ) kputc(',',&buf->tmps); + else kputc(',',&buf->tmps); + } + int len = buf->tmps.l; + buf->tmps.s[buf->tmps.l-1] = '|'; + + for (i=0; isplit.nout; i++) + { + buf->tmps.l = len; + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)]; + uint8_t *ptr = buf->split.tbl + i*buf->split.nori; + for (j=0; jsplit.nori; j++) + { + if ( ptr[j]!=1 ) continue; + kputw(j+1,&buf->tmps); + kputc(',',&buf->tmps); + } + buf->tmps.s[--buf->tmps.l] = 0; + if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 ) + error("An error occurred while updating INFO/%s\n",buf->split.info_tag); + } +} +static void _split_table_set_gt(abuf_t *buf) +{ + int nsmpl = bcf_hdr_nsamples(buf->hdr); + if ( !nsmpl ) return; + + bcf1_t *rec = 
buf->split.rec; + buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt); + if ( buf->ngt<=0 ) return; + else + hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi); + + int iout,i,j; + for (iout=0; ioutsplit.nout; iout++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int star_allele = _has_star_allele(buf,iout); + int max_ploidy = buf->ngt/nsmpl; + int32_t *src = buf->gt, *dst = buf->tmpi; + for (i=0; i=rec->n_allele ) + error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1); + int ial = _split_table_get_ial(buf,iout,iori); + if ( ial==2 && !star_allele ) + dst[j] = bcf_gt_missing; + else + dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial); + } + src += max_ploidy; + dst += max_ploidy; + } + bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt); + } +} +static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode) +{ + int nsmpl = bcf_hdr_nsamples(buf->hdr); + if ( !nsmpl ) return; + + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id); + if ( tag[0]=='G' && tag[1]=='T' && !tag[2] ) // FORMAT/GT + { + _split_table_set_gt(buf); + return; + } + + int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id); + int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id); + if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_LONG ) return; // todo: 64bit integers + + const int num_size = 4; + assert( num_size==sizeof(int32_t) && num_size==sizeof(float) ); + int32_t missing = bcf_int32_missing; + void *missing_ptr = (void*)&missing; + if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + + bcf1_t *rec = buf->split.rec; + int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? 
buf->mtmp/num_size : buf->mtmp; // number of items + int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes + + if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid + + // Check for incorrect number of values. Note this check does not consider all values missing + // and will remove annotations that don't pass. + if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return; + + // Increase buffer size to accommodate star allele + int nval1 = nval / nsmpl; + mtmp = buf->mtmp; + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele + else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3); + + if ( buf->mtmp2 < mtmp ) + { + buf->tmp2 = realloc(buf->tmp2, mtmp); + if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp); + buf->mtmp2 = mtmp; + } + + int iout, i, j; + for (iout=0; ioutsplit.nout; iout++) + { + int star_allele = _has_star_allele(buf,iout); + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int ret = 0; + if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); + else if ( len==BCF_VL_A ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp + nval1*num_size*i; + void *dst = buf->tmp2 + num_size*i*(star_allele+1); + memcpy(dst,src+iori*num_size,num_size); + if ( star_allele ) + memcpy(dst+num_size,missing_ptr,num_size); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type); + } + else if ( len==BCF_VL_R ) + { + int iori = buf->split.atoms[iout]->ial; + assert( iori<=nval ); + for (i=0; itmp + nval1*num_size*i; + void *dst = buf->tmp2 + 
num_size*i*(star_allele+2); + memcpy(dst,src,num_size); + memcpy(dst+num_size,src+iori*num_size,num_size); + + if ( type==BCF_HT_INT && mode==M_SUM ) + { + uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; + for (j=iori; jsplit.nori; j++) + if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1]; + } + if ( star_allele ) + memcpy(dst+num_size*2,missing_ptr,num_size); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type); + } + else if ( len==BCF_VL_G ) + { + int iori = buf->split.atoms[iout]->ial; + int i01 = bcf_alleles2gt(0,iori); + int i11 = bcf_alleles2gt(iori,iori); + assert( ioritmp + i*nval1; \ + type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \ + int n=0; /* determine ploidy of this genotype */ \ + while ( ntmp + i*nval1; \ + memcpy(dst++,src,sizeof(type)); \ + int nmiss = 0, nend = 0; \ + if ( n==rec->n_allele ) /* haploid */ \ + { \ + memcpy(dst++,src+iori,sizeof(type)); \ + if ( star_allele ) { nmiss = 1; nend = 3; } \ + else nend = 1; \ + } \ + else if ( n==nval1 ) \ + { \ + memcpy(dst++,src+i01,sizeof(type)); \ + memcpy(dst++,src+i11,sizeof(type)); \ + if ( star_allele ) nmiss = 3; \ + } \ + else if ( n==1 && is_missing ) \ + { \ + if ( star_allele ) nend = 5; \ + else nend = 2; \ + } \ + else \ + error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \ + bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \ + for (j=0; jout_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type); + } + if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag); + } +} +static inline int _is_acgtn(char *seq) +{ + while ( *seq ) + { + char c = toupper(*seq); + if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0; + seq++; + } + return 1; +} +/* + The atomization works as follows: + - Atomize each alternate allele separately by leaving out sequence identical to the reference. 
No + alignment is performed, just greedy trimming of the end, then from left. This operation returns + a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets + to the original REF allele + - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each + with a single ALT. + - For each new VCF record determine how to translate the original allele index (iori) to this new + record: + - 1: the original allele matches the atom + - 0: the original allele does not overlap this atom or the overlapping part matches the REF + allele + - 2 (or equivalently "."): there is a mismatch between the original allele and the atom + The mapping is encoded in a table with columns corresponding to the original ALTs and rows + to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching + atoms and 2's for overlapping mismatching atoms. + + Note that different ALT alleles can result in the same atom (the same output line) and this code + does not know how to reconcile possibly conflicting VCF annotations. This could be improved + and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should + be summed for the same atomized output allele. However, this level of complexity is not addressed + in this initial draft. Higher priority for now is to provide the inverse "join" operation. + + Update 2021-04-09: + Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT. + Note that the code will fail on missing values (todo) and it needs to be generalized and + made customizable. 
+*/ +void _abuf_split(abuf_t *buf, bcf1_t *rec) +{ + int i,j; + if ( rec->n_allele < 2 ) + { + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + int j = rbuf_append(&buf->rbuf); + if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]); + buf->vcf[j] = bcf_dup(rec); + return; + } + for (i=1; in_allele; i++) + { + if ( _is_acgtn(rec->d.allele[i]) ) continue; + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + int j = rbuf_append(&buf->rbuf); + if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]); + buf->vcf[j] = bcf_dup(rec); + return; + } + + buf->natoms = 0; + for (i=1; in_allele; i++) _atomize_allele(buf,rec,i); + qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms); + _split_table_init(buf,rec,buf->natoms); + for (i=0; inatoms; i++) + { + if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue; + _split_table_new(buf, &buf->atoms[i]); // add a new unique output atom + } + for (i=0; inatoms; i++) + { + // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles + atom_t *atom = &buf->atoms[i]; + for (j=0; jsplit.nout; j++) + { + atom_t *out = buf->split.atoms[j]; + if ( atom == out ) continue; // table already set to 1 + if ( atom->beg > out->end ) continue; // cannot overlap this output atom + if ( atom->end < out->beg ) break; // this atom is ahead of all subsequent output records + _split_table_overlap(buf, j, atom); + } + } + assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode + + // Create the output records, transferring all annotations: + // CHROM-QUAL + _split_table_set_chrom_qual(buf); + + // INFO + for (i=0; in_info; i++) + { + // this implementation of merging rules is temporary: generalize and made customizable through the API + merge_rule_t mode = M_FIRST; + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key); + if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM; + + _split_table_set_info(buf, &rec->d.info[i], mode); + } + + 
// Set INFO tag showing the original record + if ( buf->split.info_tag ) + _split_table_set_history(buf); + + // FORMAT + for (i=0; in_fmt; i++) + { + // this implementation of merging rules is temporary: generalize and made customizable through the API + merge_rule_t mode = M_FIRST; + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id); + if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM; + + _split_table_set_format(buf, &rec->d.fmt[i], mode); + } +} + +void abuf_push(abuf_t *buf, bcf1_t *rec) +{ + bcf_unpack(rec, BCF_UN_ALL); + if ( buf->mode==SPLIT ) _abuf_split(buf,rec); +} + +bcf1_t *abuf_flush(abuf_t *buf, int flush_all) +{ + int i; + + if ( buf->rbuf.n==0 ) return NULL; + if ( flush_all ) goto ret; + +ret: + i = rbuf_shift(&buf->rbuf); + return buf->vcf[i]; +} + diff --git a/bcftools/abuf.c.pysam.c b/bcftools/abuf.c.pysam.c new file mode 100644 index 0000000..811ef10 --- /dev/null +++ b/bcftools/abuf.c.pysam.c @@ -0,0 +1,715 @@ +#include "bcftools.pysam.h" + +/* The MIT License + + Copyright (c) 2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include +#include "bcftools.h" +#include "abuf.h" +#include "rbuf.h" + +typedef enum +{ + M_FIRST, M_SUM +} +merge_rule_t; + +typedef struct +{ + kstring_t ref, alt; + int ial; // the index of the original ALT allele, 1-based + int beg, end; // 0-based inclusive offsets to ref,alt +} +atom_t; + +typedef struct +{ + bcf1_t *rec; + int nori, nout; // number of ALTs in the input, and VCF rows on output + uint8_t *tbl; // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below + uint8_t *overlaps; // is the star allele needed for this variant? + atom_t **atoms; + int matoms, mtbl, moverlaps; + char *info_tag; +} +split_t; + +struct _abuf_t +{ + abuf_opt_t mode; + split_t split; + atom_t *atoms; + int natoms, matoms; + const bcf_hdr_t *hdr; + bcf_hdr_t *out_hdr; + bcf1_t **vcf; // dimensions stored in rbuf + rbuf_t rbuf; + + kstring_t tmps; + void *tmp, *tmp2; + int32_t *gt, *tmpi; + int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2; + int star_allele; +}; + +abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode) +{ + if ( mode!=SPLIT ) error("todo\n"); + abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t)); + buf->hdr = hdr; + buf->out_hdr = (bcf_hdr_t*) hdr; + buf->mode = mode; + buf->star_allele = 1; + rbuf_init(&buf->rbuf, 0); + return buf; +} + +void abuf_destroy(abuf_t *buf) +{ + int i; + for (i=0; imatoms; i++) + { + free(buf->atoms[i].ref.s); + free(buf->atoms[i].alt.s); + } + free(buf->atoms); + free(buf->split.atoms); + free(buf->split.overlaps); + free(buf->split.tbl); + for (i=0; irbuf.m; i++) + if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]); + free(buf->vcf); + free(buf->gt); + free(buf->tmpi); + 
free(buf->tmp); + free(buf->tmp2); + free(buf->tmps.s); + free(buf); +} + +void abuf_set(abuf_t *buf, abuf_opt_t key, void *value) +{ + if ( key==BCF_HDR ) { buf->out_hdr = *((bcf_hdr_t**)value); return; } + if ( key==INFO_TAG ) + { + buf->split.info_tag = *((char**)value); + bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); + return; + } + if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; } +} + +/* + Split alleles into primitivs, e.g. + CC>TT becomes C>T,C>T + GCGT>GTGA becomes C>T,T>A + + There is no sequence alignment, just trimming and hungry matching + from left side. +*/ +static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) +{ + // Trim identical sequence from right + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[ial]; + int rlen = strlen(ref); + int alen = strlen(alt); + while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--; + int Mlen = rlen > alen ? rlen : alen; + + atom_t *atom = NULL; + int i; + for (i=0; ialt); + if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; } + } + else + { + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + } + continue; + } + if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion? 
+ { + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + } + } +} +static int _atoms_inconsistent(const atom_t *a, const atom_t *b) +{ + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + int rcmp = strcasecmp(a->ref.s,b->ref.s); + if ( rcmp ) return rcmp; + return strcasecmp(a->alt.s,b->alt.s); +} +/* + For reproducibility of tests on different platforms, we need to guarantee the same order of identical + atoms originating from different source ALTs. Even though they are consistent, different values can be + picked for VCF annotations as currently the values from the one that comes first are used. +*/ +static int _cmp_atoms(const void *aptr, const void *bptr) +{ + const atom_t *a = (const atom_t*) aptr; + const atom_t *b = (const atom_t*) bptr; + int rcmp = _atoms_inconsistent(a,b); + if ( rcmp ) return rcmp; + if ( a->ial < b->ial ) return -1; + if ( a->ial > b->ial ) return 1; + return 0; +} +static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms) +{ + buf->split.rec = rec; + buf->split.nori = rec->n_allele - 1; + buf->split.nout = 0; + hts_expand(uint8_t,buf->split.nori*natoms,buf->split.mtbl,buf->split.tbl); + hts_expand(atom_t*,natoms,buf->split.matoms,buf->split.atoms); + hts_expand(uint8_t,natoms,buf->split.moverlaps,buf->split.overlaps); + memset(buf->split.overlaps,0,sizeof(*buf->split.overlaps)*natoms); +} +static void _split_table_new(abuf_t *buf, atom_t *atom) +{ + int i, iout = buf->split.nout++; + buf->split.atoms[iout] = atom; + uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; + for (i=0; isplit.nori; i++) ptr[i] = 0; + ptr[atom->ial-1] = 1; +} +static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom) +{ + uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; + ptr[atom->ial-1] = 
_atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1; + buf->split.overlaps[iout] = 1; +} +#if 0 +static void _split_table_print(abuf_t *buf) +{ + int i,j; + for (i=0; isplit.nout; i++) + { + atom_t *atom = buf->split.atoms[i]; + uint8_t *ptr = buf->split.tbl + i*buf->split.nori; + fprintf(bcftools_stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s); + for (j=0; jsplit.nori; j++) fprintf(bcftools_stderr,"\t%d",(int)ptr[j]); + fprintf(bcftools_stderr,"\n"); + } +} +static void _split_table_print_atoms(abuf_t *buf) +{ + int i; + for (i=0; inatoms; i++) + { + atom_t *atom = &buf->atoms[i]; + fprintf(bcftools_stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end); + } +} +#endif +static inline uint8_t _has_star_allele(abuf_t *buf, int iout) +{ + if ( !buf->star_allele ) return 0; + return buf->split.overlaps[iout]; +} +static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial) +{ + if ( !ial ) return ial; + return buf->split.tbl[irow*buf->split.nori + ial - 1]; +} +static void _split_table_set_chrom_qual(abuf_t *buf) +{ + int iout,j; + bcf1_t *rec = buf->split.rec; + for (iout=0; ioutsplit.nout; iout++) + { + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + j = rbuf_append(&buf->rbuf); + if ( !buf->vcf[j] ) buf->vcf[j] = bcf_init1(); + bcf1_t *out = buf->vcf[j]; + bcf_clear1(out); + + atom_t *atom = buf->split.atoms[iout]; + out->rid = rec->rid; + out->pos = rec->pos + atom->beg; + bcf_update_id(buf->out_hdr, out, rec->d.id); + + const char *als[3]; + als[0] = atom->ref.s; + als[1] = atom->alt.s; + als[2] = "*"; + int nals = _has_star_allele(buf,iout) ? 
3 : 2; + bcf_update_alleles(buf->out_hdr, out, als, nals); + + if ( bcf_float_is_missing(rec->qual) ) + bcf_float_set_missing(out->qual); + else + out->qual = rec->qual; + + bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt); + } +} +static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode) +{ + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key); + int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key); + int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key); + if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags + if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_LONG ) return; // todo: 64bit integers + + bcf1_t *rec = buf->split.rec; + int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp; + int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4; + + // Check for incorrect number of values. Note this check does not consider all values missing + // and will remove annotations that don't pass. + if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return; + + if ( buf->mtmp2 < buf->mtmp ) + { + buf->tmp2 = realloc(buf->tmp2, buf->mtmp); + if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp); + buf->mtmp2 = buf->mtmp; + } + + int32_t missing = bcf_int32_missing; + void *missing_ptr = (void*)&missing; + if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + + int iout,i; + for (iout=0; ioutsplit.nout; iout++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int star_allele = _has_star_allele(buf,iout); + int ret = 0; + if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) + ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? 
NULL : buf->tmp, nval, type); + else if ( len==BCF_VL_A ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp2,buf->tmp+4*iori,4); + if ( star_allele ) + memcpy(buf->tmp2+4,missing_ptr,4); + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type); + } + else if ( len==BCF_VL_R ) + { + memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records + int iori = buf->split.atoms[iout]->ial; + assert( iorisplit.nori ); + memcpy(buf->tmp2+4,buf->tmp+4*iori,4); + if ( type==BCF_HT_INT && mode==M_SUM ) + { + uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; + for (i=iori; isplit.nori; i++) + { + if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1]; + } + } + if ( star_allele ) + memcpy(buf->tmp2+8,missing_ptr,4); + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type); + } + if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag); + } +} +static void _split_table_set_history(abuf_t *buf) +{ + int i,j; + bcf1_t *rec = buf->split.rec; + buf->tmps.l = 0; + ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]); + for (i=1; in_allele; i++) + { + kputs(rec->d.allele[i],&buf->tmps); + if ( i+1n_allele ) kputc(',',&buf->tmps); + else kputc(',',&buf->tmps); + } + int len = buf->tmps.l; + buf->tmps.s[buf->tmps.l-1] = '|'; + + for (i=0; isplit.nout; i++) + { + buf->tmps.l = len; + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)]; + uint8_t *ptr = buf->split.tbl + i*buf->split.nori; + for (j=0; jsplit.nori; j++) + { + if ( ptr[j]!=1 ) continue; + kputw(j+1,&buf->tmps); + kputc(',',&buf->tmps); + } + buf->tmps.s[--buf->tmps.l] = 0; + if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 ) + error("An error occurred while updating INFO/%s\n",buf->split.info_tag); + } +} +static void _split_table_set_gt(abuf_t *buf) +{ + int nsmpl = bcf_hdr_nsamples(buf->hdr); + if ( !nsmpl ) return; + + bcf1_t *rec = 
buf->split.rec; + buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt); + if ( buf->ngt<=0 ) return; + else + hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi); + + int iout,i,j; + for (iout=0; ioutsplit.nout; iout++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int star_allele = _has_star_allele(buf,iout); + int max_ploidy = buf->ngt/nsmpl; + int32_t *src = buf->gt, *dst = buf->tmpi; + for (i=0; i=rec->n_allele ) + error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1); + int ial = _split_table_get_ial(buf,iout,iori); + if ( ial==2 && !star_allele ) + dst[j] = bcf_gt_missing; + else + dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial); + } + src += max_ploidy; + dst += max_ploidy; + } + bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt); + } +} +static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode) +{ + int nsmpl = bcf_hdr_nsamples(buf->hdr); + if ( !nsmpl ) return; + + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id); + if ( tag[0]=='G' && tag[1]=='T' && !tag[2] ) // FORMAT/GT + { + _split_table_set_gt(buf); + return; + } + + int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id); + int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id); + if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_LONG ) return; // todo: 64bit integers + + const int num_size = 4; + assert( num_size==sizeof(int32_t) && num_size==sizeof(float) ); + int32_t missing = bcf_int32_missing; + void *missing_ptr = (void*)&missing; + if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + + bcf1_t *rec = buf->split.rec; + int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? 
buf->mtmp/num_size : buf->mtmp; // number of items + int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes + + if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid + + // Check for incorrect number of values. Note this check does not consider all values missing + // and will remove annotations that don't pass. + if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return; + + // Increase buffer size to accommodate star allele + int nval1 = nval / nsmpl; + mtmp = buf->mtmp; + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele + else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3); + + if ( buf->mtmp2 < mtmp ) + { + buf->tmp2 = realloc(buf->tmp2, mtmp); + if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp); + buf->mtmp2 = mtmp; + } + + int iout, i, j; + for (iout=0; ioutsplit.nout; iout++) + { + int star_allele = _has_star_allele(buf,iout); + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int ret = 0; + if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); + else if ( len==BCF_VL_A ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp + nval1*num_size*i; + void *dst = buf->tmp2 + num_size*i*(star_allele+1); + memcpy(dst,src+iori*num_size,num_size); + if ( star_allele ) + memcpy(dst+num_size,missing_ptr,num_size); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type); + } + else if ( len==BCF_VL_R ) + { + int iori = buf->split.atoms[iout]->ial; + assert( iori<=nval ); + for (i=0; itmp + nval1*num_size*i; + void *dst = buf->tmp2 + 
num_size*i*(star_allele+2); + memcpy(dst,src,num_size); + memcpy(dst+num_size,src+iori*num_size,num_size); + + if ( type==BCF_HT_INT && mode==M_SUM ) + { + uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; + for (j=iori; jsplit.nori; j++) + if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1]; + } + if ( star_allele ) + memcpy(dst+num_size*2,missing_ptr,num_size); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type); + } + else if ( len==BCF_VL_G ) + { + int iori = buf->split.atoms[iout]->ial; + int i01 = bcf_alleles2gt(0,iori); + int i11 = bcf_alleles2gt(iori,iori); + assert( ioritmp + i*nval1; \ + type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \ + int n=0; /* determine ploidy of this genotype */ \ + while ( ntmp + i*nval1; \ + memcpy(dst++,src,sizeof(type)); \ + int nmiss = 0, nend = 0; \ + if ( n==rec->n_allele ) /* haploid */ \ + { \ + memcpy(dst++,src+iori,sizeof(type)); \ + if ( star_allele ) { nmiss = 1; nend = 3; } \ + else nend = 1; \ + } \ + else if ( n==nval1 ) \ + { \ + memcpy(dst++,src+i01,sizeof(type)); \ + memcpy(dst++,src+i11,sizeof(type)); \ + if ( star_allele ) nmiss = 3; \ + } \ + else if ( n==1 && is_missing ) \ + { \ + if ( star_allele ) nend = 5; \ + else nend = 2; \ + } \ + else \ + error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \ + bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \ + for (j=0; jout_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type); + } + if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag); + } +} +static inline int _is_acgtn(char *seq) +{ + while ( *seq ) + { + char c = toupper(*seq); + if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0; + seq++; + } + return 1; +} +/* + The atomization works as follows: + - Atomize each alternate allele separately by leaving out sequence identical to the reference. 
No + alignment is performed, just greedy trimming of the end, then from left. This operation returns + a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets + to the original REF allele + - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each + with a single ALT. + - For each new VCF record determine how to translate the original allele index (iori) to this new + record: + - 1: the original allele matches the atom + - 0: the original allele does not overlap this atom or the overlapping part matches the REF + allele + - 2 (or equivalently "."): there is a mismatch between the original allele and the atom + The mapping is encoded in a table with columns corresponding to the original ALTs and rows + to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching + atoms and 2's for overlapping mismatching atoms. + + Note that different ALT alleles can result in the same atom (the same output line) and this code + does not know how to reconcile possibly conflicting VCF annotations. This could be improved + and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should + be summed for the same atomized output allele. However, this level of complexity is not addressed + in this initial draft. Higher priority for now is to provide the inverse "join" operation. + + Update 2021-04-09: + Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT. + Note that the code will fail on missing values (todo) and it needs to be generalized and + made customizable. 
+*/ +void _abuf_split(abuf_t *buf, bcf1_t *rec) +{ + int i,j; + if ( rec->n_allele < 2 ) + { + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + int j = rbuf_append(&buf->rbuf); + if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]); + buf->vcf[j] = bcf_dup(rec); + return; + } + for (i=1; in_allele; i++) + { + if ( _is_acgtn(rec->d.allele[i]) ) continue; + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + int j = rbuf_append(&buf->rbuf); + if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]); + buf->vcf[j] = bcf_dup(rec); + return; + } + + buf->natoms = 0; + for (i=1; in_allele; i++) _atomize_allele(buf,rec,i); + qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms); + _split_table_init(buf,rec,buf->natoms); + for (i=0; inatoms; i++) + { + if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue; + _split_table_new(buf, &buf->atoms[i]); // add a new unique output atom + } + for (i=0; inatoms; i++) + { + // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles + atom_t *atom = &buf->atoms[i]; + for (j=0; jsplit.nout; j++) + { + atom_t *out = buf->split.atoms[j]; + if ( atom == out ) continue; // table already set to 1 + if ( atom->beg > out->end ) continue; // cannot overlap this output atom + if ( atom->end < out->beg ) break; // this atom is ahead of all subsequent output records + _split_table_overlap(buf, j, atom); + } + } + assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode + + // Create the output records, transferring all annotations: + // CHROM-QUAL + _split_table_set_chrom_qual(buf); + + // INFO + for (i=0; in_info; i++) + { + // this implementation of merging rules is temporary: generalize and made customizable through the API + merge_rule_t mode = M_FIRST; + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key); + if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM; + + _split_table_set_info(buf, &rec->d.info[i], mode); + } + + 
// Set INFO tag showing the original record + if ( buf->split.info_tag ) + _split_table_set_history(buf); + + // FORMAT + for (i=0; in_fmt; i++) + { + // this implementation of merging rules is temporary: generalize and made customizable through the API + merge_rule_t mode = M_FIRST; + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id); + if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM; + + _split_table_set_format(buf, &rec->d.fmt[i], mode); + } +} + +void abuf_push(abuf_t *buf, bcf1_t *rec) +{ + bcf_unpack(rec, BCF_UN_ALL); + if ( buf->mode==SPLIT ) _abuf_split(buf,rec); +} + +bcf1_t *abuf_flush(abuf_t *buf, int flush_all) +{ + int i; + + if ( buf->rbuf.n==0 ) return NULL; + if ( flush_all ) goto ret; + +ret: + i = rbuf_shift(&buf->rbuf); + return buf->vcf[i]; +} + diff --git a/bcftools/abuf.h b/bcftools/abuf.h new file mode 100644 index 0000000..5fc1e00 --- /dev/null +++ b/bcftools/abuf.h @@ -0,0 +1,78 @@ +/* The MIT License + + Copyright (c) 2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +/* + Atomize/deatomize complex variants +*/ + +#ifndef __ABUF_H__ +#define __ABUF_H__ + +#include + +typedef struct _abuf_t abuf_t; + +// Modes of operation +typedef enum +{ + NONE, + + // mode of operation, to be passed to abuf_init + SPLIT, + JOIN, + + BCF_HDR, // should the records be annotated, a writable bcf header is required + INFO_TAG, // set BCF_HDR first + STAR_ALLELE // 1: use STAR allele (the default), 0: set overlaps to missing +} +abuf_opt_t; + +#define abuf_set_opt(buf,type,key,value) { type tmp = value; abuf_set(buf, key, (void*)&tmp); } +void abuf_set(abuf_t *buf, abuf_opt_t key, void *value); + +/* + * abuf_init() - init buffer + * @win: number of sites (>0) or bp (<0) + */ +abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode); +void abuf_destroy(abuf_t *buf); + +/* + * abuf_push() - Push a new site for analysis + */ +void abuf_push(abuf_t *buf, bcf1_t *rec); + +/* + * abuf_flush() - Return next buffered record + * @flush_all: Set to 1 if no more overlapping records are coming (e.g. end of chromosome or end of file), + * the buffer can be emptied. + * return: The next atomized/deatomized VCF record or NULL if no record is ready. The returned + * structure will be cleaned by abuf. + */ +bcf1_t *abuf_flush(abuf_t *buf, int flush_all); + +#endif + diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c index d080917..336e2f6 100644 --- a/bcftools/bam2bcf.c +++ b/bcftools/bam2bcf.c @@ -1,7 +1,7 @@ /* bam2bcf.c -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. 
Author: Heng Li @@ -40,7 +40,8 @@ extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); #define CAP_DIST 25 -bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) +bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, + int delta_baseQ) { bcf_callaux_t *bca; if (theta <= 0.) theta = CALL_DEFTHETA; @@ -48,6 +49,8 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->capQ = 60; bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; bca->min_baseQ = min_baseQ; + bca->max_baseQ = max_baseQ; + bca->delta_baseQ = delta_baseQ; bca->e = errmod_init(1. - theta); bca->min_frac = 0.002; bca->min_support = 1; @@ -55,9 +58,13 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->npos = 100; bca->ref_pos = (int*) malloc(bca->npos*sizeof(int)); bca->alt_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->iref_pos= (int*) malloc(bca->npos*sizeof(int)); + bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int)); bca->nqual = 60; bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int)); bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int)); bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int)); bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int)); bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int)); @@ -69,47 +76,68 @@ void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; errmod_destroy(bca->e); - if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } - free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); + if (bca->npos) { + free(bca->ref_pos); free(bca->alt_pos); + free(bca->iref_pos); free(bca->ialt_pos); + bca->npos = 0; + } + free(bca->ref_mq); free(bca->alt_mq); + free(bca->iref_mq); free(bca->ialt_mq); + free(bca->ref_bq); free(bca->alt_bq); free(bca->fwd_mqs); free(bca->rev_mqs); bca->nqual = 0; free(bca->bases); free(bca->inscns); free(bca); } // position in the sequence with 
respect to the aligned part of the read -static int get_position(const bam_pileup1_t *p, int *len) -{ - int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; - for (icig=0; icigb->core.n_cigar; icig++) - { - int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; - int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; - if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) - { - n_tot_bases += ncig; - iread += ncig; - continue; - } - if ( cig==BAM_CINS ) - { - n_tot_bases += ncig; - iread += ncig; +static int get_position(const bam_pileup1_t *p, int *len, + int *sc_len, int *sc_dist) { + int i, j, edist = p->qpos + 1; + int sc_left = 0, sc_right = 0; + int sc_left_dist = -1, sc_right_dist = -1; + + // left end + for (i = 0; i < p->b->core.n_cigar; i++) { + int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK; + if (cig == BAM_CHARD_CLIP) continue; - } - if ( cig==BAM_CSOFT_CLIP ) - { - iread += ncig; - if ( iread<=p->qpos ) edist -= ncig; + else if (cig == BAM_CSOFT_CLIP) + sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT; + else + break; + } + if (sc_left) + sc_left_dist = p->qpos+1 - sc_left; + edist -= sc_left; + + // right end + for (j = p->b->core.n_cigar-1; j >= i; j--) { + int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK; + if (cig == BAM_CHARD_CLIP) continue; + else if (cig == BAM_CSOFT_CLIP) + sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT; + else + break; + } + if (sc_right) + sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos; + + // Distance to nearest soft-clips and length of that clip. 
+ if (sc_left_dist >= 0) { + if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) { + *sc_len = sc_left; + *sc_dist = sc_left_dist; } - if ( cig==BAM_CDEL ) continue; - if ( cig==BAM_CHARD_CLIP ) continue; - if ( cig==BAM_CPAD ) continue; - if ( cig==BAM_CREF_SKIP ) continue; - fprintf(stderr,"todo: cigar %d\n", cig); - assert(0); - } - *len = n_tot_bases; + } else if (sc_right_dist >= 0) { + *sc_len = sc_right; + *sc_dist = sc_right_dist; + } else { + *sc_len = 0; + *sc_dist = 0; + } + + *len = p->b->core.l_qseq - sc_left - sc_right; return edist; } @@ -117,8 +145,12 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) { memset(bca->ref_pos,0,sizeof(int)*bca->npos); memset(bca->alt_pos,0,sizeof(int)*bca->npos); + memset(bca->iref_pos,0,sizeof(int)*bca->npos); + memset(bca->ialt_pos,0,sizeof(int)*bca->npos); memset(bca->ref_mq,0,sizeof(int)*bca->nqual); memset(bca->alt_mq,0,sizeof(int)*bca->nqual); + memset(bca->iref_mq,0,sizeof(int)*bca->nqual); + memset(bca->ialt_mq,0,sizeof(int)*bca->nqual); memset(bca->ref_bq,0,sizeof(int)*bca->nqual); memset(bca->alt_bq,0,sizeof(int)*bca->nqual); memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); @@ -126,13 +158,18 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES); + memset(bca->ref_scl, 0, 100*sizeof(int)); + memset(bca->alt_scl, 0, 100*sizeof(int)); + memset(bca->iref_scl, 0, 100*sizeof(int)); + memset(bca->ialt_scl, 0, 100*sizeof(int)); } /* Notes: - - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies - which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. 
- Later it's used for multiallelic calling by bcftools -m + - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies + which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations. + Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`. - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. */ /* @@ -150,7 +187,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // clean from previous run r->ori_depth = 0; r->mq0 = 0; - memset(r->qsum,0,sizeof(float)*4); memset(r->anno,0,sizeof(double)*16); memset(r->p,0,sizeof(float)*25); r->SCR = 0; @@ -166,30 +202,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t kroundup32(bca->max_bases); bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } + // fill the bases array + double nqual_over_60 = bca->nqual / 60.0; + int ADR_ref_missed[4] = {0}; + int ADF_ref_missed[4] = {0}; for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; + if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; if (p->is_del && !is_indel) continue; ++ori_depth; if (is_indel) { - b = p->aux>>16&0x3f; - baseQ = q = p->aux&0xff; - // This read is not counted as indel. Instead of skipping it, treat it as ref. 
It is - // still only an approximation, but gives more accurate AD counts and calls correctly - // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam) - if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos]; - seqQ = p->aux>>8&0xff; + b = p->aux>>16&0x3f; + seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias + if (q < bca->min_baseQ) + { + if (!p->indel && b < 4) + { + if (bam_is_rev(p->b)) + ADR_ref_missed[b]++; + else + ADF_ref_missed[b]++; + } + continue; + } + if (p->indel == 0 && (q < _n/2 || _n > 20)) { + // high quality indel calls without p->indel set aren't + // particularly indicative of being a good REF match either, + // at least not in low coverage. So require solid coverage + // before we start utilising such quals. + b = 0; + q = (int)bam_get_qual(p->b)[p->qpos]; + seqQ = (3*seqQ + 2*q)/8; + } + if (_n > 20 && seqQ > 40) seqQ = 40; + baseQ = p->aux>>8&0xff; + is_diff = (b != 0); } else { b = bam_seqi(bam_get_seq(p->b), p->qpos); // base b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base - baseQ = q = (int)bam_get_qual(p->b)[p->qpos]; + + // Lowest of this and neighbour quality values + uint8_t *qual = bam_get_qual(p->b); + q = qual[p->qpos]; + if (p->qpos > 0 && + q > qual[p->qpos-1]+bca->delta_baseQ) + q = qual[p->qpos-1]+bca->delta_baseQ; + if (p->qpos+1 < p->b->core.l_qseq && + q > qual[p->qpos+1]+bca->delta_baseQ) + q = qual[p->qpos+1]+bca->delta_baseQ; + if (q < bca->min_baseQ) continue; + if (q > bca->max_baseQ) q = bca->max_baseQ; + baseQ = q; seqQ = 99; is_diff = (ref4 < 4 && b == ref4)? 
0 : 1; } @@ -201,11 +272,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (q > 63) q = 63; if (q < 4) q = 4; // MQ=0 reads count as BQ=4 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; - if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; // collect annotations if (b < 4) { - r->qsum[b] += q; + r->QS[b] += q; if ( r->ADF ) { if ( bam_is_rev(p->b) ) @@ -228,29 +298,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // collect for bias tests if ( baseQ > 59 ) baseQ = 59; if ( mapQ > 59 ) mapQ = 59; - int len, epos = 0; - if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) + int len, epos = 0, sc_len = 0, sc_dist = 0; + if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) ) { - int pos = get_position(p, &len); + int pos = get_position(p, &len, &sc_len, &sc_dist); epos = (double)pos/(len+1) * bca->npos; + + if (sc_len) { + sc_len = 15.0*sc_len / sc_dist; + if (sc_len > 99) sc_len = 99; + } } - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. * bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; - else bca->fwd_mqs[imq]++; + + int imq = mapQ * nqual_over_60; + int ibq = baseQ * nqual_over_60; + + if ( bam_is_rev(p->b) ) + bca->rev_mqs[imq]++; + else + bca->fwd_mqs[imq]++; + if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) { bca->ref_pos[epos]++; bca->ref_bq[ibq]++; bca->ref_mq[imq]++; + bca->ref_scl[sc_len]++; } else { bca->alt_pos[epos]++; bca->alt_bq[ibq]++; bca->alt_mq[imq]++; + bca->alt_scl[sc_len]++; } } + + // Compensate for AD not being counted on low quality REF indel matches. + if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 ) + { + for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0? 
+ { + r->ADR[i] += ADR_ref_missed[i]; + r->ADF[i] += ADF_ref_missed[i]; + } + } + else if ( r->ADF && bca->ambig_reads==B2B_INC_AD ) + { + int dp = 0, dp_ambig = 0; + for (i=0; i<4; i++) dp += r->ADR[i]; + for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i]; + if ( dp ) + for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp); + dp = 0, dp_ambig = 0; + for (i=0; i<4; i++) dp += r->ADF[i]; + for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i]; + if ( dp ) + for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp); + } + r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype @@ -437,7 +543,7 @@ double calc_mwu_bias_cdf(int *a, int *b, int n) return pval>1 ? 1 : pval; } -double calc_mwu_bias(int *a, int *b, int n) +double calc_mwu_bias(int *a, int *b, int n, int left) { int na = 0, nb = 0, i; double U = 0, ties = 0; @@ -461,6 +567,7 @@ double calc_mwu_bias(int *a, int *b, int n) if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely double mean = ((double)na*nb)*0.5; + if (left && U > mean) return 1; // for MQB which is asymmetrical if ( na==2 || nb==2 ) { // Linear approximation @@ -483,6 +590,85 @@ double calc_mwu_bias(int *a, int *b, int n) return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); } +// A Z-score version of the above function. +// +// See "Normal approximation and tie correction" at +// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test +// +// The Z score is the number of standard deviations above or below the mean +// with 0 being equality of the two distributions and +ve/-ve from there. +// +// This is a more robust score to filter on. +double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) { + int i; + int64_t t; + + // Optimisation + for (i = 0; i < n; i++) + if (b[i]) + break; + int b_empty = (i == n); + + // Count equal (e), less-than (l) and greater-than (g) permutations. 
+ int e = 0, l = 0, na = 0, nb = 0; + if (b_empty) { + for (t = 0, i = n-1; i >= 0; i--) { + na += a[i]; + t += (a[i]*a[i]-1)*a[i]; // adjustment score for ties + } + } else { + for (t = 0, i = n-1; i >= 0; i--) { + // Combinations of a[i] and b[j] for i==j + e += a[i]*b[i]; + + // nb is running total of b[i+1]..b[n-1]. + // Therefore a[i]*nb is the number of combinations of a[i] and b[j] + // for all i < j. + l += a[i]*nb; // a= 0 ? 0.5 : -0.5)) / sd; // gatk method? + return (U - m) / sqrt(var2); + } + + // Else U score, which can be asymmetric for some data types. + if (left_only && U > m) + return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not. + + if (na >= 8 || nb >= 8) { + // Normal approximation, very good for na>=8 && nb>=8 and + // reasonable if na<8 or nb<8 + return exp(-0.5*(U-m)*(U-m)/var2); + } + + // Exact calculation + if (na==1 || nb == 1) + return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2); + else + return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2); +} + static inline double logsumexp2(double a, double b) { if ( a>b ) @@ -558,7 +744,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j; - float qsum[5] = {0,0,0,0,0}; + float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0}; if (ref_base >= 0) { call->ori_ref = ref4 = seq_nt16_int[ref_base]; if (ref4 > 4) ref4 = 4; @@ -569,9 +755,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int for (i = 0; i < n; ++i) { float sum = 0; - for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; + for (j = 0; j < 4; ++j) sum += calls[i].QS[j]; if ( sum ) - for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; + for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum; } // sort qsum in ascending order (insertion sort) @@ -583,7 +769,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // Set the 
reference allele and alternative allele(s) for (i=0; i<5; i++) call->a[i] = -1; - for (i=0; i<5; i++) call->qsum[i] = 0; + for (i=0; iqsum[i] = 0; call->unseen = -1; call->a[0] = ref4; for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering @@ -695,6 +881,21 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int adf += B2B_MAX_ALLELES; } } + if ( bca->fmt_flag & B2B_FMT_QS ) + { + assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well + + // reorder QS to match the allele ordering at this site + int32_t tmp[B2B_MAX_ALLELES]; + int32_t *qs = call->QS, *qs_out = call->QS; + for (i=0; in_alleles; j++) tmp[j] = qs[ call->a[j] ]; + for (j=0; jn_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32; + qs_out += call->n_alleles; + qs += B2B_MAX_ALLELES; + } + } // if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); call->shift = (int)(sum_min + .499); @@ -717,11 +918,43 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + // U z-normalised as +/- number of standard deviations from mean. 
+ if (call->ori_ref < 0) { + if (bca->fmt_flag & B2B_INFO_RPB) + call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, + bca->npos, 0, 1); + call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, + bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_SCB ) + call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, + 100, 0,1); + } else { + if (bca->fmt_flag & B2B_INFO_RPB) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, + bca->npos, 0, 1); + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, + bca->nqual,1,1); + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, + bca->nqual,0,1); + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, + bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_SCB ) + call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, + 100, 0,1); + } + } else { + // Old method; U as probability between 0 and 1 + if ( bca->fmt_flag & B2B_INFO_RPB ) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, + bca->npos, 0, 0); + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, + bca->nqual, 1, 0); + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, + bca->nqual, 0, 0); + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, + bca->nqual, 0, 0); + } #if CDF_MWU_TESTS // CDF version of MWU tests is not calculated by default @@ -732,7 +965,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); #endif - if ( bca->fmt_flag & B2B_INFO_VDB ) + if ( bca->fmt_flag & B2B_INFO_VDB ) call->vdb = calc_vdb(bca->alt_pos, bca->npos); return 0; @@ -819,10 +1052,32 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - 
if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + } else { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + } + + if ( bc->strand_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); + #if CDF_MWU_TESTS if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); @@ -884,6 +1139,8 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, } if ( fmt_flag&B2B_FMT_SCR ) bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); + if ( fmt_flag&B2B_FMT_QS ) + bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele); return 0; } diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c index 16a559a..001363e 100644 --- a/bcftools/bam2bcf.c.pysam.c +++ b/bcftools/bam2bcf.c.pysam.c @@ -3,7 +3,7 @@ /* bam2bcf.c -- variant calling. 
Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Heng Li @@ -42,7 +42,8 @@ extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); #define CAP_DIST 25 -bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) +bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, + int delta_baseQ) { bcf_callaux_t *bca; if (theta <= 0.) theta = CALL_DEFTHETA; @@ -50,6 +51,8 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->capQ = 60; bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; bca->min_baseQ = min_baseQ; + bca->max_baseQ = max_baseQ; + bca->delta_baseQ = delta_baseQ; bca->e = errmod_init(1. - theta); bca->min_frac = 0.002; bca->min_support = 1; @@ -57,9 +60,13 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->npos = 100; bca->ref_pos = (int*) malloc(bca->npos*sizeof(int)); bca->alt_pos = (int*) malloc(bca->npos*sizeof(int)); + bca->iref_pos= (int*) malloc(bca->npos*sizeof(int)); + bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int)); bca->nqual = 60; bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int)); bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int)); + bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int)); bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int)); bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int)); bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int)); @@ -71,47 +78,68 @@ void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; errmod_destroy(bca->e); - if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } - free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq); + if (bca->npos) { + free(bca->ref_pos); free(bca->alt_pos); + free(bca->iref_pos); free(bca->ialt_pos); + bca->npos = 0; + } + free(bca->ref_mq); free(bca->alt_mq); + free(bca->iref_mq); free(bca->ialt_mq); + free(bca->ref_bq); free(bca->alt_bq); 
free(bca->fwd_mqs); free(bca->rev_mqs); bca->nqual = 0; free(bca->bases); free(bca->inscns); free(bca); } // position in the sequence with respect to the aligned part of the read -static int get_position(const bam_pileup1_t *p, int *len) -{ - int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; - for (icig=0; icigb->core.n_cigar; icig++) - { - int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK; - int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; - if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) - { - n_tot_bases += ncig; - iread += ncig; - continue; - } - if ( cig==BAM_CINS ) - { - n_tot_bases += ncig; - iread += ncig; +static int get_position(const bam_pileup1_t *p, int *len, + int *sc_len, int *sc_dist) { + int i, j, edist = p->qpos + 1; + int sc_left = 0, sc_right = 0; + int sc_left_dist = -1, sc_right_dist = -1; + + // left end + for (i = 0; i < p->b->core.n_cigar; i++) { + int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK; + if (cig == BAM_CHARD_CLIP) continue; - } - if ( cig==BAM_CSOFT_CLIP ) - { - iread += ncig; - if ( iread<=p->qpos ) edist -= ncig; + else if (cig == BAM_CSOFT_CLIP) + sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT; + else + break; + } + if (sc_left) + sc_left_dist = p->qpos+1 - sc_left; + edist -= sc_left; + + // right end + for (j = p->b->core.n_cigar-1; j >= i; j--) { + int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK; + if (cig == BAM_CHARD_CLIP) continue; + else if (cig == BAM_CSOFT_CLIP) + sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT; + else + break; + } + if (sc_right) + sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos; + + // Distance to nearest soft-clips and length of that clip. 
+ if (sc_left_dist >= 0) { + if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) { + *sc_len = sc_left; + *sc_dist = sc_left_dist; } - if ( cig==BAM_CDEL ) continue; - if ( cig==BAM_CHARD_CLIP ) continue; - if ( cig==BAM_CPAD ) continue; - if ( cig==BAM_CREF_SKIP ) continue; - fprintf(bcftools_stderr,"todo: cigar %d\n", cig); - assert(0); - } - *len = n_tot_bases; + } else if (sc_right_dist >= 0) { + *sc_len = sc_right; + *sc_dist = sc_right_dist; + } else { + *sc_len = 0; + *sc_dist = 0; + } + + *len = p->b->core.l_qseq - sc_left - sc_right; return edist; } @@ -119,8 +147,12 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) { memset(bca->ref_pos,0,sizeof(int)*bca->npos); memset(bca->alt_pos,0,sizeof(int)*bca->npos); + memset(bca->iref_pos,0,sizeof(int)*bca->npos); + memset(bca->ialt_pos,0,sizeof(int)*bca->npos); memset(bca->ref_mq,0,sizeof(int)*bca->nqual); memset(bca->alt_mq,0,sizeof(int)*bca->nqual); + memset(bca->iref_mq,0,sizeof(int)*bca->nqual); + memset(bca->ialt_mq,0,sizeof(int)*bca->nqual); memset(bca->ref_bq,0,sizeof(int)*bca->nqual); memset(bca->alt_bq,0,sizeof(int)*bca->nqual); memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual); @@ -128,13 +160,18 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES); + memset(bca->ref_scl, 0, 100*sizeof(int)); + memset(bca->alt_scl, 0, 100*sizeof(int)); + memset(bca->iref_scl, 0, 100*sizeof(int)); + memset(bca->ialt_scl, 0, 100*sizeof(int)); } /* Notes: - - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies - which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. 
- Later it's used for multiallelic calling by bcftools -m + - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies + which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations. + Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`. - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. */ /* @@ -152,7 +189,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // clean from previous run r->ori_depth = 0; r->mq0 = 0; - memset(r->qsum,0,sizeof(float)*4); memset(r->anno,0,sizeof(double)*16); memset(r->p,0,sizeof(float)*25); r->SCR = 0; @@ -168,30 +204,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t kroundup32(bca->max_bases); bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } + // fill the bases array + double nqual_over_60 = bca->nqual / 60.0; + int ADR_ref_missed[4] = {0}; + int ADF_ref_missed[4] = {0}; for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; + if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; if (p->is_del && !is_indel) continue; ++ori_depth; if (is_indel) { - b = p->aux>>16&0x3f; - baseQ = q = p->aux&0xff; - // This read is not counted as indel. Instead of skipping it, treat it as ref. 
It is - // still only an approximation, but gives more accurate AD counts and calls correctly - // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam) - if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos]; - seqQ = p->aux>>8&0xff; + b = p->aux>>16&0x3f; + seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias + if (q < bca->min_baseQ) + { + if (!p->indel && b < 4) + { + if (bam_is_rev(p->b)) + ADR_ref_missed[b]++; + else + ADF_ref_missed[b]++; + } + continue; + } + if (p->indel == 0 && (q < _n/2 || _n > 20)) { + // high quality indel calls without p->indel set aren't + // particularly indicative of being a good REF match either, + // at least not in low coverage. So require solid coverage + // before we start utilising such quals. + b = 0; + q = (int)bam_get_qual(p->b)[p->qpos]; + seqQ = (3*seqQ + 2*q)/8; + } + if (_n > 20 && seqQ > 40) seqQ = 40; + baseQ = p->aux>>8&0xff; + is_diff = (b != 0); } else { b = bam_seqi(bam_get_seq(p->b), p->qpos); // base b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base - baseQ = q = (int)bam_get_qual(p->b)[p->qpos]; + + // Lowest of this and neighbour quality values + uint8_t *qual = bam_get_qual(p->b); + q = qual[p->qpos]; + if (p->qpos > 0 && + q > qual[p->qpos-1]+bca->delta_baseQ) + q = qual[p->qpos-1]+bca->delta_baseQ; + if (p->qpos+1 < p->b->core.l_qseq && + q > qual[p->qpos+1]+bca->delta_baseQ) + q = qual[p->qpos+1]+bca->delta_baseQ; + if (q < bca->min_baseQ) continue; + if (q > bca->max_baseQ) q = bca->max_baseQ; + baseQ = q; seqQ = 99; is_diff = (ref4 < 4 && b == ref4)? 
0 : 1; } @@ -203,11 +274,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (q > 63) q = 63; if (q < 4) q = 4; // MQ=0 reads count as BQ=4 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; - if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; // collect annotations if (b < 4) { - r->qsum[b] += q; + r->QS[b] += q; if ( r->ADF ) { if ( bam_is_rev(p->b) ) @@ -230,29 +300,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // collect for bias tests if ( baseQ > 59 ) baseQ = 59; if ( mapQ > 59 ) mapQ = 59; - int len, epos = 0; - if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) + int len, epos = 0, sc_len = 0, sc_dist = 0; + if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) ) { - int pos = get_position(p, &len); + int pos = get_position(p, &len, &sc_len, &sc_dist); epos = (double)pos/(len+1) * bca->npos; + + if (sc_len) { + sc_len = 15.0*sc_len / sc_dist; + if (sc_len > 99) sc_len = 99; + } } - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. * bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; - else bca->fwd_mqs[imq]++; + + int imq = mapQ * nqual_over_60; + int ibq = baseQ * nqual_over_60; + + if ( bam_is_rev(p->b) ) + bca->rev_mqs[imq]++; + else + bca->fwd_mqs[imq]++; + if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) { bca->ref_pos[epos]++; bca->ref_bq[ibq]++; bca->ref_mq[imq]++; + bca->ref_scl[sc_len]++; } else { bca->alt_pos[epos]++; bca->alt_bq[ibq]++; bca->alt_mq[imq]++; + bca->alt_scl[sc_len]++; } } + + // Compensate for AD not being counted on low quality REF indel matches. + if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 ) + { + for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0? 
+ { + r->ADR[i] += ADR_ref_missed[i]; + r->ADF[i] += ADF_ref_missed[i]; + } + } + else if ( r->ADF && bca->ambig_reads==B2B_INC_AD ) + { + int dp = 0, dp_ambig = 0; + for (i=0; i<4; i++) dp += r->ADR[i]; + for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i]; + if ( dp ) + for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp); + dp = 0, dp_ambig = 0; + for (i=0; i<4; i++) dp += r->ADF[i]; + for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i]; + if ( dp ) + for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp); + } + r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype @@ -439,7 +545,7 @@ double calc_mwu_bias_cdf(int *a, int *b, int n) return pval>1 ? 1 : pval; } -double calc_mwu_bias(int *a, int *b, int n) +double calc_mwu_bias(int *a, int *b, int n, int left) { int na = 0, nb = 0, i; double U = 0, ties = 0; @@ -463,6 +569,7 @@ double calc_mwu_bias(int *a, int *b, int n) if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely double mean = ((double)na*nb)*0.5; + if (left && U > mean) return 1; // for MQB which is asymmetrical if ( na==2 || nb==2 ) { // Linear approximation @@ -485,6 +592,85 @@ double calc_mwu_bias(int *a, int *b, int n) return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2); } +// A Z-score version of the above function. +// +// See "Normal approximation and tie correction" at +// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test +// +// The Z score is the number of standard deviations above or below the mean +// with 0 being equality of the two distributions and +ve/-ve from there. +// +// This is a more robust score to filter on. +double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) { + int i; + int64_t t; + + // Optimisation + for (i = 0; i < n; i++) + if (b[i]) + break; + int b_empty = (i == n); + + // Count equal (e), less-than (l) and greater-than (g) permutations. 
+ int e = 0, l = 0, na = 0, nb = 0; + if (b_empty) { + for (t = 0, i = n-1; i >= 0; i--) { + na += a[i]; + t += (a[i]*a[i]-1)*a[i]; // adjustment score for ties + } + } else { + for (t = 0, i = n-1; i >= 0; i--) { + // Combinations of a[i] and b[j] for i==j + e += a[i]*b[i]; + + // nb is running total of b[i+1]..b[n-1]. + // Therefore a[i]*nb is the number of combinations of a[i] and b[j] + // for all i < j. + l += a[i]*nb; // a= 0 ? 0.5 : -0.5)) / sd; // gatk method? + return (U - m) / sqrt(var2); + } + + // Else U score, which can be asymmetric for some data types. + if (left_only && U > m) + return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not. + + if (na >= 8 || nb >= 8) { + // Normal approximation, very good for na>=8 && nb>=8 and + // reasonable if na<8 or nb<8 + return exp(-0.5*(U-m)*(U-m)/var2); + } + + // Exact calculation + if (na==1 || nb == 1) + return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2); + else + return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2); +} + static inline double logsumexp2(double a, double b) { if ( a>b ) @@ -560,7 +746,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j; - float qsum[5] = {0,0,0,0,0}; + float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0}; if (ref_base >= 0) { call->ori_ref = ref4 = seq_nt16_int[ref_base]; if (ref4 > 4) ref4 = 4; @@ -571,9 +757,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int for (i = 0; i < n; ++i) { float sum = 0; - for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; + for (j = 0; j < 4; ++j) sum += calls[i].QS[j]; if ( sum ) - for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; + for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum; } // sort qsum in ascending order (insertion sort) @@ -585,7 +771,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // Set the 
reference allele and alternative allele(s) for (i=0; i<5; i++) call->a[i] = -1; - for (i=0; i<5; i++) call->qsum[i] = 0; + for (i=0; iqsum[i] = 0; call->unseen = -1; call->a[0] = ref4; for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering @@ -697,6 +883,21 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int adf += B2B_MAX_ALLELES; } } + if ( bca->fmt_flag & B2B_FMT_QS ) + { + assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well + + // reorder QS to match the allele ordering at this site + int32_t tmp[B2B_MAX_ALLELES]; + int32_t *qs = call->QS, *qs_out = call->QS; + for (i=0; in_alleles; j++) tmp[j] = qs[ call->a[j] ]; + for (j=0; jn_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32; + qs_out += call->n_alleles; + qs += B2B_MAX_ALLELES; + } + } // if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); call->shift = (int)(sum_min + .499); @@ -719,11 +920,43 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + // U z-normalised as +/- number of standard deviations from mean. 
+ if (call->ori_ref < 0) { + if (bca->fmt_flag & B2B_INFO_RPB) + call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, + bca->npos, 0, 1); + call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, + bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_SCB ) + call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, + 100, 0,1); + } else { + if (bca->fmt_flag & B2B_INFO_RPB) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, + bca->npos, 0, 1); + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, + bca->nqual,1,1); + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, + bca->nqual,0,1); + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, + bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_SCB ) + call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, + 100, 0,1); + } + } else { + // Old method; U as probability between 0 and 1 + if ( bca->fmt_flag & B2B_INFO_RPB ) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, + bca->npos, 0, 0); + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, + bca->nqual, 1, 0); + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, + bca->nqual, 0, 0); + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, + bca->nqual, 0, 0); + } #if CDF_MWU_TESTS // CDF version of MWU tests is not calculated by default @@ -734,7 +967,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); #endif - if ( bca->fmt_flag & B2B_INFO_VDB ) + if ( bca->fmt_flag & B2B_INFO_VDB ) call->vdb = calc_vdb(bca->alt_pos, bca->npos); return 0; @@ -821,10 +1054,32 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - 
if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + } else { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + } + + if ( bc->strand_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); + #if CDF_MWU_TESTS if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); @@ -886,6 +1141,8 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, } if ( fmt_flag&B2B_FMT_SCR ) bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); + if ( fmt_flag&B2B_FMT_QS ) + bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele); return 0; } diff --git a/bcftools/bam2bcf.h b/bcftools/bam2bcf.h index 2d2cf83..e8b0fb9 100644 --- a/bcftools/bam2bcf.h +++ b/bcftools/bam2bcf.h @@ -1,7 +1,7 @@ /* bam2bcf.h -- variant calling. 
Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2014,2016 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Heng Li @@ -59,21 +59,36 @@ DEALINGS IN THE SOFTWARE. */ #define B2B_FMT_SCR (1<<13) #define B2B_INFO_VDB (1<<14) #define B2B_INFO_RPB (1<<15) +#define B2B_FMT_QS (1<<16) +#define B2B_INFO_SCB (1<<17) +#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised #define B2B_MAX_ALLELES 5 +#define B2B_DROP 0 +#define B2B_INC_AD 1 +#define B2B_INC_AD0 2 + #define PLP_HAS_SOFT_CLIP(i) ((i)&1) -#define PLP_SAMPLE_ID(i) ((i)>>1) +#define PLP_HAS_INDEL(i) ((i)&2) +#define PLP_SAMPLE_ID(i) ((i)>>2) + +#define PLP_SET_SOFT_CLIP(i) ((i)|=1) +#define PLP_SET_INDEL(i) ((i)|=2) +#define PLP_SET_SAMPLE_ID(i,n) ((i)|=(n)<<2) typedef struct __bcf_callaux_t { - int fmt_flag; - int capQ, min_baseQ; + int fmt_flag, ambig_reads; + int capQ, min_baseQ, max_baseQ, delta_baseQ; int openQ, extQ, tandemQ; // for indels uint32_t min_support, max_support; // for collecting indel candidates double min_frac; // for collecting indel candidates float max_frac; // for collecting indel candidates int per_sample_flt; // indel filtering strategy int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests + int *iref_pos, *ialt_pos, *iref_mq, *ialt_mq; // for indels + int ref_scl[100], alt_scl[100]; // soft-clip length bias; SNP + int iref_scl[100], ialt_scl[100]; // soft-clip length bias; INDEL // for internal uses int max_bases; int indel_types[4]; // indel lengths @@ -83,14 +98,14 @@ typedef struct __bcf_callaux_t { uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types) errmod_t *e; void *rghash; + float indel_bias; // adjusts indel score threshold; lower => call more. 
} bcf_callaux_t; // per-sample values typedef struct { - uint32_t ori_depth; + uint32_t ori_depth; // ori_depth = anno[0..3] but before --min-BQ is applied unsigned int mq0; - int32_t *ADF, *ADR, SCR; - float qsum[4]; + int32_t *ADF, *ADR, SCR, *QS; // FMT/QS // The fields are: // depth fwd .. ref (0) and non-ref (2) // depth rev .. ref (1) and non-ref (3) @@ -112,19 +127,20 @@ typedef struct { int tid, pos; bcf_hdr_t *bcf_hdr; int a[5]; // alleles: ref, alt, alt2, alt3 - float qsum[5]; // for the QS tag + float qsum[B2B_MAX_ALLELES]; // INFO/QS tag int n, n_alleles, shift, ori_ref, unseen; int n_supp; // number of supporting non-reference reads double anno[16]; unsigned int depth, ori_depth, mq0; - int32_t *PL, *DP4, *ADR, *ADF, *SCR; + int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS; uint8_t *fmt_arr; float vdb; // variant distance bias - float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; + float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc; #if CDF_MWU_TESTS float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf; #endif float seg_bias; + float strand_bias; // phred-scaled fisher-exact test kstring_t tmp; } bcf_call_t; @@ -132,7 +148,8 @@ typedef struct { extern "C" { #endif - bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); + bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, + int delta_baseQ); void bcf_call_destroy(bcf_callaux_t *bca); int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); diff --git a/bcftools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c index 6c367da..facb3bf 100644 --- a/bcftools/bam2bcf_indel.c +++ b/bcftools/bam2bcf_indel.c @@ -1,7 +1,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014,2016 Genome Research Ltd. + Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd. 
Author: Heng Li @@ -26,19 +26,29 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include "bam2bcf.h" +#include "str_finder.h" #include KSORT_INIT_GENERIC(uint32_t) #define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 50 +#define INDEL_WINDOW_SIZE 110 +#define MAX_TYPES 64 + +// Take a reference position tpos and convert to a query position (returned). +// This uses the CIGAR string plus alignment c->pos to do the mapping. +// +// *_tpos is returned as tpos if query overlaps tpos, but for deletions +// it'll be either the start (is_left) or end (!is_left) ref position. static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) { + // x = pos in ref, y = pos in query seq int k, x = c->pos, y = 0, last_y = 0; *_tpos = c->pos; for (k = 0; k < c->n_cigar; ++k) { @@ -64,6 +74,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, *_tpos = x; return last_y; } + // FIXME: check if the inserted sequence is consistent with the homopolymer run // l is the relative gap length and l_run is the length of the homopolymer on the reference static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) @@ -87,21 +98,609 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) return max_i - pos; } +// Identify spft-clip length, position in seq, and clipped seq len +static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, + int *sc_len_r, int *slen_r, int *epos_r, int *end) { + bam1_t *b = p->b; + int sc_len = 0, sc_dist = -1, at_left = 1; + int epos = p->qpos, slen = b->core.l_qseq; + int k; + uint32_t *cigar = bam_get_cigar(b); + *end = -1; + for (k = 0; k < b->core.n_cigar; k++) { + int op = bam_cigar_op(cigar[k]); + if (op == BAM_CSOFT_CLIP) { + slen -= bam_cigar_oplen(cigar[k]); + if (at_left) { + // left end + sc_len += bam_cigar_oplen(cigar[k]); + epos -= sc_len; // don't count SC in seq pos + sc_dist 
= epos; + *end = 0; + } else { + // right end + int srlen = bam_cigar_oplen(cigar[k]); + int rd = b->core.l_qseq - srlen - p->qpos; + if (sc_dist < 0 || sc_dist > rd) { + // closer to right end than left + // FIXME: compensate for indel length too? + sc_dist = rd; + sc_len = srlen; + *end = 1; + } + } + } else if (op != BAM_CHARD_CLIP) { + at_left = 0; + } + } + + if (p->indel > 0 && slen - (epos+p->indel) < epos) + epos += p->indel-1; // end of insertion, if near end of seq + + // slen is now length of sequence minus soft-clips and + // epos is position of indel in seq minus left-clip. + *epos_r = (double)epos / (slen+1) * bca->npos; + + if (sc_len) { + // scale importance of clip by distance to closest end + *sc_len_r = 15.0*sc_len / (sc_dist+1); + if (*sc_len_r > 99) *sc_len_r = 99; + } else { + *sc_len_r = 0; + } + + *slen_r = slen; +} + +// Part of bcf_call_gap_prep. +// +// Scans the pileup to identify all the different sizes of indels +// present. +// +// Returns types and fills out n_types_r, max_rd_len_r and ref_type_r, +// or NULL on error. +static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int *max_rd_len_r, int *n_types_r, + int *ref_type_r, int *N_r) { + int i, j, t, s, N, m, max_rd_len, n_types; + int n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + int *types; + + // N is the total number of reads + for (s = N = 0; s < n; ++s) + N += n_plp[s]; + + bca->max_support = bca->max_frac = 0; + aux = (uint32_t*) calloc(N + 1, 4); + if (!aux) + return NULL; + + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type (REF) + + // Fill out aux[] array with all the non-zero indel sizes. + // Also tally number with indels (n_alt) and total (n_tot). 
+ for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + + // FIXME: cache me in pileup struct. + j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + double frac = (double)na/nt; + if ( !indel_support_ok && na >= bca->min_support + && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) + bca->max_support = na, bca->max_frac = frac; + + n_alt += na; + n_tot += nt; + } + + // Sort aux[] and dedup + ks_introsort(uint32_t, m, aux); + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + + // Taking totals makes it hard to call rare indels (IMF filter) + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac + || n_alt < bca->min_support ) + ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); + return NULL; + } + + // Bail out if we have far too many types of indel + if (n_types >= MAX_TYPES) { + free(aux); + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) + fprintf(stderr, "[%s] excessive INDEL alleles at position %d. " + "Skip the position.\n", __func__, pos + 1); + return NULL; + } + + // To prevent long stretches of N's to be mistaken for indels + // (sometimes thousands of bases), check the number of N's in the + // sequence and skip places where half or more reference bases are Ns. + int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len + ?2*INDEL_WINDOW_SIZE : max_rd_len); + for (i=pos; i(i-pos) ) { + free(aux); + return NULL; + } + + // Finally fill out the types[] array detailing the size of insertion + // or deletion. 
+ types = (int*)calloc(n_types, sizeof(int)); + if (!types) { + free(aux); + return NULL; + } + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + + // Find reference type; types[?] == 0) + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + + *ref_type_r = t; + *n_types_r = n_types; + *max_rd_len_r = max_rd_len; + *N_r = N; + + return types; +} + +// Part of bcf_call_gap_prep. +// +// Construct per-sample consensus. +// +// Returns an array of consensus seqs, +// or NULL on failure. +static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int left, int right) { + int i, k, s, L = right - left + 1, max_i, max2_i; + char **ref_sample; // returned + uint32_t *cns = NULL, max, max2; + char *ref0 = NULL, *r; + ref_sample = (char**) calloc(n, sizeof(char*)); + cns = (uint32_t*) calloc(L, 4); + ref0 = (char*) calloc(L, 1); + if (!ref_sample || !cns || !ref0) { + n = 0; + goto err; + } + + // Convert ref ASCII to 0-15. + for (i = 0; i < right - left; ++i) + ref0[i] = seq_nt16_table[(int)ref[i+left]]; + + // NB: one consensus per sample 'n', not per indel type. + // FIXME: consider fixing this. We should compute alignments vs + // types, not vs samples? Or types/sample combined? + for (s = 0; s < n; ++s) { + r = ref_sample[s] = (char*) calloc(L, 1); + if (!r) { + n = s-1; + goto err; + } + + memset(cns, 0, sizeof(int) * L); + + // collect ref and non-ref counts in cns + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int x = b->core.pos, y = 0; + + // TODO: pileup exposes pileup_ind, but we also need e.g. + // pileup_len to know how much of the current CIGAR op-len + // we've used (or have remaining). 
If we had that, we + // could start at p->qpos without having to scan through + // the entire CIGAR string until we find it. + // + // Without it about all we could do is have a side channel + // to cache the last known coords. Messy, so punt for now. + // This is no longer the bottle neck until we get to 1000s of + // CIGAR ops. + + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (x + l >= left) { + j = left - x > 0 ? left - x : 0; + int j_end = right - x < l ? right - x : l; + for (; j < j_end; j++) + // Append to cns. Note this is ref coords, + // so insertions aren't in cns and deletions + // will have lower coverage. + + // FIXME: want true consensus (with ins) per + // type, so we can independently compare each + // seq to each consensus and see which it + // matches best, so we get proper GT analysis. + cns[x+j-left] += + (bam_seqi(seq, y+j) == ref0[x+j-left]) + ? 1 // REF + : (1<<16); // ALT + } + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + x += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + } + + if (x > right) + break; + } + } + + // Determine a sample specific reference. + for (i = 0; i < right - left; ++i) + r[i] = ref0[i]; + + // Find deepest and 2nd deepest ALT region (max & max2). + max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) + max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) + max2 = cns[i], max2_i = i; + } + + // Masks mismatches present in at least 70% of the reads with 'N'. + // This code is nREF/(nREF+n_ALT) >= 70% for deepest region. + // The effect is that at least 30% of bases differing to REF will + // use "N" in consensus, so we don't penalise ALT or REF when + // aligning against it. (A poor man IUPAC code) + // + // Why is it only done in two loci at most? 
+ if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) + max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) + max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; + + //for (i = 0; i < right - left; ++i) + // fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); + //fputc('\n', stderr); + } + + free(ref0); + free(cns); + + return ref_sample; + + err: + free(ref0); + free(cns); + if (ref_sample) { + for (s = 0; s < n; s++) + free(ref_sample[s]); + free(ref_sample); + } + + return NULL; +} + +// The length of the homopolymer run around the current position +static int bcf_cgp_l_run(const char *ref, int pos) { + int i, l_run; + + int c = seq_nt16_table[(int)ref[pos + 1]]; + if (c == 15) { + l_run = 1; + } else { + for (i = pos + 2; ref[i]; ++i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + + return l_run; +} + + +// Compute the consensus for this sample 's', minus indels which +// get added later. +static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { + int i, j, t, k; + int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int)); + if (!inscns_aux) + return NULL; + + // Count the number of occurrences of each base at each position for + // each type of insertion. 
+ for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam_get_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; + } + } + } + } + } + } + + // Use the majority rule to construct the consensus + char *inscns = (char *)calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) { + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max ? max_k : 4; + if (max_k == 4) { + // discard insertions which contain N's + types[t] = 0; + break; + } + } + } + free(inscns_aux); + + return inscns; +} + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +// Part of bcf_call_gap_prep. +// +// Realign using BAQ to get an alignment score of a single read vs +// a haplotype consensus. +// +// Fills out score +// Returns 0 on success, +// <0 on error +static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, + int type, uint8_t *ref2, uint8_t *query, + int r_start, int r_end, int long_read, + int tbeg, int tend, + int left, int right, + int qbeg, int qend, + int qpos, int max_deletion, + int *score) { + // Illumina + probaln_par_t apf = { 1e-4, 1e-2, 10 }; + + // Parameters that work better on PacBio CCS 15k. + // We should consider querying the header and RG PU field. + // See also htslib/realn.c:sam_prob_realn() + if (long_read) { + apf.d = 1e-3; + apf.e = 1e-1; + } + + type = abs(type); + apf.bw = type + 3; + int l, sc; + const uint8_t *qual = bam_get_qual(p->b), *bq; + uint8_t *qq; + + // Get segment of quality, either ZQ tag or if absent QUAL. 
+ if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) + return -1; + bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + if (bq) ++bq; // skip type + for (l = qbeg; l < qend; ++l) { + int qval = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qval > 30) + qval = 30; + if (qval < 7) + qval = 7; + qq[l - qbeg] = qval; + } + + // The bottom 8 bits are length-normalised score while + // the top bits are unnormalised. + sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, + query, qend - qbeg, qq, &apf, 0, 0); + if (sc < 0) { + *score = 0xffffff; + free(qq); + return 0; + } + + // used for adjusting indelQ below + l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; + *score = sc<<8 | MIN(255, l); + + rep_ele *reps, *elt, *tmp; + uint8_t *seg = ref2 + tbeg - left; + int seg_len = tend - tbeg + type; + + // Note: although seg moves (tbeg varies), ref2 is reused many times + // so we could factor out some find_STR calls. However it's not the + // bottleneck for now. + + // FIXME: need to make this work on IUPAC. + reps = find_STR((char *)seg, seg_len, 0); + int iscore = 0; + + // Identify STRs in ref covering the indel up to + // (or close to) the end of the sequence. + // Those having an indel and right at the sequence + // end do not confirm the total length of indel + // size. Specifically a *lack* of indel at the + // end, where we know indels occur in other + // sequences, is a possible reference bias. + // + // This is emphasised further if the sequence ends with + // soft clipping. + DL_FOREACH_SAFE(reps, elt, tmp) { + if (elt->start <= qpos && elt->end >= qpos) { + iscore += (elt->end-elt->start) / elt->rep_len; // c + if (elt->start+tbeg <= r_start || + elt->end+tbeg >= r_end) + iscore += 2*(elt->end-elt->start); + } + + DL_DELETE(reps, elt); + free(elt); + } + + // Apply STR score to existing indelQ + l = (*score&0xff)*.8 + iscore*2; + *score = (*score & ~0xff) | MIN(255, l); + + free(qq); + + return 0; +} + +// Part of bcf_call_gap_prep. 
+// +// Returns n_alt on success +// -1 on failure +static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, + bcf_callaux_t *bca, char *inscns, + int l_run, int max_ins, + int ref_type, int *types, int n_types, + int *score) { + // FIXME: n_types has a maximum; no need to alloc - use a #define? + int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp; + memset(sumq, 0, n_types * sizeof(int)); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score[K*n_types], seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + indelQ = (sc[1]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + tmp = sc[0]>>6 & 0xff; + // reduce indelQ + indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + + // Doesn't really help accuracy, but permits -h to take + // affect still. + if (indelQ > seqQ) indelQ = seqQ; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total + sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; + // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + if (bca->maxins && !bca->inscns) + return -1; + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + if (bca->maxins) + memcpy(&bca->inscns[t * bca->maxins], + &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; + //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + } + } + + return n_alt; +} + +/* +FIXME: with high number of samples, do we handle IMF correctly? Is it +fraction of indels across entire data set, or just fraction for this +specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt. + */ + /* notes: - - n .. 
number of samples - - the routine sets bam_pileup1_t.aux of each read as follows: - - 6: unused - - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff + - n .. number of samples + - the routine sets bam_pileup1_t.aux of each read as follows: + - 6: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff */ -int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, + bcf_callaux_t *bca, const char *ref) { - int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; + if (ref == 0 || bca == 0) return -1; + + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins; + int *score, max_ref2; int N, K, l_run, ref_type, n_alt; char *inscns = 0, *ref2, *query, **ref_sample; - if (ref == 0 || bca == 0) return -1; // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -109,77 +708,29 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (plp[s][i].indel != 0) break; if (i < n_plp[s]) break; } - if (s == n) return -1; // there is no indel at this position. 
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // find out how many types of indels are present - bca->max_support = bca->max_frac = 0; - int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; - uint32_t *aux; - aux = (uint32_t*) calloc(N + 1, 4); - m = max_rd_len = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (s = 0; s < n; ++s) { - int na = 0, nt = 0; - for (i = 0; i < n_plp[s]; ++i) { - const bam_pileup1_t *p = plp[s] + i; - ++nt; - if (p->indel != 0) { - ++na; - aux[m++] = MINUS_CONST + p->indel; - } - j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - double frac = (double)na/nt; - if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) - indel_support_ok = 1; - if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; - n_alt += na; - n_tot += nt; - } - // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), - // check the number of N's in the sequence and skip places where half or more reference bases are Ns. - int nN=0; for (i=pos; i-pos(i-pos) ) { free(aux); return -1; } - - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - // Taking totals makes it hard to call rare indels - if ( !bca->per_sample_flt ) - indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; - if ( n_types == 1 || !indel_support_ok ) { // then skip - free(aux); return -1; - } - if (n_types >= 64) { - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) - fprintf(stderr, "[%s] excessive INDEL alleles at position %d. 
Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); - t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; - free(aux); - for (t = 0; t < n_types; ++t) - if (types[t] == 0) break; - ref_type = t; // the index of the reference type (0) - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - /* The following block fixes a long-existing flaw in the INDEL + if (s == n) + // there is no indel at this position. + return -1; + + // find out how many types of indels are present + types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, + &max_rd_len, &n_types, &ref_type, &N); + if (!types) + return -1; + + + // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + + + /* The following call fixes a long-existing flaw in the INDEL * calling model: the interference of nearby SNPs. However, it also * reduces the power because sometimes, substitutions caused by * indels are not distinguishable from true mutations. Multiple @@ -187,284 +738,211 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla * * Masks mismatches present in at least 70% of the reads with 'N'. 
*/ - { // construct per-sample consensus - int L = right - left + 1, max_i, max2_i; - uint32_t *cns, max, max2; - char *ref0, *r; - ref_sample = (char**) calloc(n, sizeof(char*)); - cns = (uint32_t*) calloc(L, 4); - ref0 = (char*) calloc(L, 1); - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; - for (s = 0; s < n; ++s) { - r = ref_sample[s] = (char*) calloc(L, 1); - memset(cns, 0, sizeof(int) * L); - // collect ref and non-ref counts - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - int x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) - if (x + j >= left && x + j < right) - cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // determine the consensus - for (i = 0; i < right - left; ++i) r[i] = ref0[i]; - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; - } - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); - } - free(ref0); free(cns); - } - { // the length of the homopolymer run around the current position - int c = seq_nt16_table[(int)ref[pos + 1]]; - if (c == 15) l_run = 1; - else { - for (i = pos + 2; ref[i]; ++i) - if (seq_nt16_table[(int)ref[i]] != c) break; 
- l_run = i; - for (i = pos; i >= 0; --i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run -= i + 1; - } - } - // construct the consensus sequence + ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right); + + // The length of the homopolymer run around the current position + l_run = bcf_cgp_l_run(ref, pos); + + // construct the consensus sequence (minus indels, which are added later) max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int)); - // count the number of occurrences of each base at each position for each type of insertion - for (t = 0; t < n_types; ++t) { - if (types[t] > 0) { - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - if (p->indel == types[t]) { - uint8_t *seq = bam_get_seq(p->b); - for (k = 1; k <= p->indel; ++k) { - int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; - assert(c<5); - ++inscns_aux[(t*max_ins+(k-1))*5 + c]; - } - } - } - } - } - } - // use the majority rule to construct the consensus - inscns = (char*) calloc(n_types * max_ins, 1); - for (t = 0; t < n_types; ++t) { - for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; - for (k = 0; k < 5; ++k) - if (ia[k] > max) - max = ia[k], max_k = k; - inscns[t*max_ins + j] = max? max_k : 4; - if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's - } - } - free(inscns_aux); + inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos, + types, n_types, max_ins, s); + if (!inscns) + return -1; } + // compute the likelihood given each type of indel for each read max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? 
max_ins : -types[0]); ref2 = (char*) calloc(max_ref2, 1); query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); - score1 = (int*) calloc(N * n_types, sizeof(int)); - score2 = (int*) calloc(N * n_types, sizeof(int)); + score = (int*) calloc(N * n_types, sizeof(int)); bca->indelreg = 0; + double nqual_over_60 = bca->nqual / 60.0; + for (t = 0; t < n_types; ++t) { int l, ir; - probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; - apf1.bw = apf2.bw = abs(types[t]) + 3; + // compute indelreg - if (types[t] == 0) ir = 0; - else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); - else ir = est_indelreg(pos, ref, -types[t], 0); - if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); - // realignment + if (types[t] == 0) + ir = 0; + else if (types[t] > 0) + ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else + ir = est_indelreg(pos, ref, -types[t], 0); + + if (ir > bca->indelreg) + bca->indelreg = ir; + + // Identify max deletion length + int max_deletion = 0; + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + if (max_deletion < -p->indel) + max_deletion = -p->indel; + } + } + + // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { - // write ref2 + // Construct ref2 from ref_sample, inscns and indels. + // This is now the true sample consensus (possibly prepended + // and appended with reference if sample data doesn't span + // the full length). 
for (k = 0, j = left; j <= pos; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; + + if (types[t] <= 0) + j += -types[t]; + else + for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; + for (; k < max_ref2; ++k) + ref2[k] = 4; + + if (right > j) + right = j; + // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, tbeg, tend, sc, kk; + + // Some basic ref vs alt stats. + int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual; + imq *= nqual_over_60; + + int sc_len, slen, epos, sc_end; + + // Only need to gather stats on one type, as it's + // identical calculation for all the subsequent ones + // and we're sharing the same stats array + if (t == 0) { + // Gather stats for INFO field to aid filtering. + // mq and sc_len not very helpful for filtering, but could + // help in assigning a better QUAL value. + // + // Pos is slightly useful. + // Base qual can be useful, but need qual prior to BAQ? + // May need to cache orig quals in aux tag so we can fetch + // them even after mpileup step. 
+ get_pos(bca, p, &sc_len, &slen, &epos, &sc_end); + + assert(imq >= 0 && imq < bca->nqual); + assert(epos >= 0 && epos < bca->npos); + assert(sc_len >= 0 && sc_len < 100); + if (p->indel) { + bca->ialt_mq[imq]++; + bca->ialt_scl[sc_len]++; + bca->ialt_pos[epos]++; + } else { + bca->iref_mq[imq]++; + bca->iref_scl[sc_len]++; + bca->iref_pos[epos]++; + } + } + + int qbeg, qpos, qend, tbeg, tend, kk; uint8_t *seq = bam_get_seq(p->b); uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads - // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. + if (p->b->core.flag & BAM_FUNMAP) continue; + + // FIXME: the following loop should be better moved outside; + // nonetheless, realignment should be much slower anyway. for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; - if (kk < p->b->core.n_cigar) continue; - // FIXME: the following skips soft clips, but using them may be more sensitive. + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) + break; + if (kk < p->b->core.n_cigar) + continue; + // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); + // FIXME: loops over CIGAR multiple times + int left2 = left, right2 = right; + if (p->b->core.l_qseq > 1000) { + // long read data needs less context. It also tends to + // have many more candidate indels to investigate so + // speed here matters more. 
+ if (pos - left >= INDEL_WINDOW_SIZE) + left2 += INDEL_WINDOW_SIZE/2; + if (right-pos >= INDEL_WINDOW_SIZE) + right2 -= INDEL_WINDOW_SIZE/2; + } + + int r_start = p->b->core.pos; + int r_end = bam_cigar2rlen(p->b->core.n_cigar, + bam_get_cigar(p->b)) + -1 + r_start; + + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2, + 0, &tbeg); + qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, + 0, &tend) - qbeg; + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2, + 1, &tend); + if (types[t] < 0) { int l = -types[t]; tbeg = tbeg - l > left? tbeg - l : left; } + // write the query sequence for (l = qbeg; l < qend; ++l) query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; - { // do realignment; this is the bottleneck - const uint8_t *qual = bam_get_qual(p->b), *bq; - uint8_t *qq; - qq = (uint8_t*) calloc(qend - qbeg, 1); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; // skip type - for (l = qbeg; l < qend; ++l) { - qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; - if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; - } - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below - if (l > 255) l = 255; - score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; - if (sc > 5) { - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); - if (l > 255) l = 255; - score2[K*n_types + t] = sc<<8 | l; + + // A fudge for now. Consider checking SAM header for + // RG platform field. 
+ int long_read = p->b->core.l_qseq > 1000; + + // do realignment; this is the bottleneck + if (tend > tbeg) { + if (bcf_cgp_align_score(p, bca, types[t], + (uint8_t *)ref2 + left2-left, + (uint8_t *)query, + r_start, r_end, long_read, + tbeg, tend, left2, right2, + qbeg, qend, qpos, max_deletion, + &score[K*n_types + t]) < 0) { + score[K*n_types + t] = 0xffffff; + return -1; } - free(qq); + } else { + // place holder large cost for reads that cover the + // region entirely within a deletion (thus tend < tbeg). + score[K*n_types + t] = 0xffffff; } #if 0 for (l = 0; l < tend - tbeg + abs(types[t]); ++l) fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); + for (l = 0; l < qend - qbeg; ++l) + fputc("ACGTN"[(int)query[l]], stderr); fputc('\n', stderr); - fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc); + fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " + "qbeg=%d tbeg=%d score=%d\n", + pos, types[t], s, i, bam_get_qname(p->b), + qbeg, tbeg, sc); #endif } } } - free(ref2); free(query); - { // compute indelQ - int sc_a[16], sumq_a[16]; - int tmp, *sc = sc_a, *sumq = sumq_a; - if (n_types > 16) { - sc = (int *)malloc(n_types * sizeof(int)); - sumq = (int *)malloc(n_types * sizeof(int)); - } - memset(sumq, 0, n_types * sizeof(int)); - for (s = K = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - /* errmod_cal() assumes that if the call is wrong, the - * likelihoods of other events are equal. This is about - * right for substitutions, but is not desired for - * indels. 
To reuse errmod_cal(), I have to make - * compromise for multi-allelic indels. - */ - if ((sc[0]&0x3f) == ref_type) { - indelQ1 = (sc[1]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ1 = (sc[t]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); - } - tmp = sc[0]>>6 & 0xff; - indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ - sct = &score2[K*n_types]; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - if ((sc[0]&0x3f) == ref_type) { - indelQ2 = (sc[1]>>14) - (sc[0]>>14); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ2 = (sc[t]>>14) - (sc[0]>>14); - } - tmp = sc[0]>>6 & 0xff; - indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); - // pick the smaller between indelQ1 and indelQ2 - indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - if (indelQ > 255) indelQ = 255; - if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); - } - } - // determine bca->indel_types[] and bca->inscns - bca->maxins = max_ins; - bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); - for (t = 0; t < n_types; ++t) - sumq[t] = sumq[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) - tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sumq[t]&0x3f) == ref_type) break; - if (t) { // then move the reference type to the first - tmp = sumq[t]; - for (; t > 0; --t) sumq[t] = sumq[t-1]; - sumq[0] = tmp; - } - for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; - for (t = 0; t < 4 && t < n_types; ++t) { - bca->indel_types[t] = types[sumq[t]&0x3f]; - memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); - } - // update p->aux - for (s = n_alt = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int x = types[p->aux>>16&0x3f]; - for (j = 0; j < 4; ++j) - if (x == bca->indel_types[j]) break; - p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); - if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); - } - } - if (sc != sc_a) free(sc); - if (sumq != sumq_a) free(sumq); - } - free(score1); free(score2); + // compute indelQ + n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, + ref_type, types, n_types, score); + // free - for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref2); + free(query); + free(score); + + for (i = 0; i < n; ++i) + free(ref_sample[i]); + free(ref_sample); free(types); free(inscns); + return n_alt > 0? 
0 : -1; } diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c index 67fff21..82bf31c 100644 --- a/bcftools/bam2bcf_indel.c.pysam.c +++ b/bcftools/bam2bcf_indel.c.pysam.c @@ -3,7 +3,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014,2016 Genome Research Ltd. + Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd. Author: Heng Li @@ -28,19 +28,29 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include "bam2bcf.h" +#include "str_finder.h" #include KSORT_INIT_GENERIC(uint32_t) #define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 50 +#define INDEL_WINDOW_SIZE 110 +#define MAX_TYPES 64 + +// Take a reference position tpos and convert to a query position (returned). +// This uses the CIGAR string plus alignment c->pos to do the mapping. +// +// *_tpos is returned as tpos if query overlaps tpos, but for deletions +// it'll be either the start (is_left) or end (!is_left) ref position. 
static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) { + // x = pos in ref, y = pos in query seq int k, x = c->pos, y = 0, last_y = 0; *_tpos = c->pos; for (k = 0; k < c->n_cigar; ++k) { @@ -66,6 +76,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, *_tpos = x; return last_y; } + // FIXME: check if the inserted sequence is consistent with the homopolymer run // l is the relative gap length and l_run is the length of the homopolymer on the reference static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) @@ -89,21 +100,609 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) return max_i - pos; } +// Identify spft-clip length, position in seq, and clipped seq len +static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, + int *sc_len_r, int *slen_r, int *epos_r, int *end) { + bam1_t *b = p->b; + int sc_len = 0, sc_dist = -1, at_left = 1; + int epos = p->qpos, slen = b->core.l_qseq; + int k; + uint32_t *cigar = bam_get_cigar(b); + *end = -1; + for (k = 0; k < b->core.n_cigar; k++) { + int op = bam_cigar_op(cigar[k]); + if (op == BAM_CSOFT_CLIP) { + slen -= bam_cigar_oplen(cigar[k]); + if (at_left) { + // left end + sc_len += bam_cigar_oplen(cigar[k]); + epos -= sc_len; // don't count SC in seq pos + sc_dist = epos; + *end = 0; + } else { + // right end + int srlen = bam_cigar_oplen(cigar[k]); + int rd = b->core.l_qseq - srlen - p->qpos; + if (sc_dist < 0 || sc_dist > rd) { + // closer to right end than left + // FIXME: compensate for indel length too? + sc_dist = rd; + sc_len = srlen; + *end = 1; + } + } + } else if (op != BAM_CHARD_CLIP) { + at_left = 0; + } + } + + if (p->indel > 0 && slen - (epos+p->indel) < epos) + epos += p->indel-1; // end of insertion, if near end of seq + + // slen is now length of sequence minus soft-clips and + // epos is position of indel in seq minus left-clip. 
+ *epos_r = (double)epos / (slen+1) * bca->npos; + + if (sc_len) { + // scale importance of clip by distance to closest end + *sc_len_r = 15.0*sc_len / (sc_dist+1); + if (*sc_len_r > 99) *sc_len_r = 99; + } else { + *sc_len_r = 0; + } + + *slen_r = slen; +} + +// Part of bcf_call_gap_prep. +// +// Scans the pileup to identify all the different sizes of indels +// present. +// +// Returns types and fills out n_types_r, max_rd_len_r and ref_type_r, +// or NULL on error. +static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int *max_rd_len_r, int *n_types_r, + int *ref_type_r, int *N_r) { + int i, j, t, s, N, m, max_rd_len, n_types; + int n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + int *types; + + // N is the total number of reads + for (s = N = 0; s < n; ++s) + N += n_plp[s]; + + bca->max_support = bca->max_frac = 0; + aux = (uint32_t*) calloc(N + 1, 4); + if (!aux) + return NULL; + + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type (REF) + + // Fill out aux[] array with all the non-zero indel sizes. + // Also tally number with indels (n_alt) and total (n_tot). + for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + + // FIXME: cache me in pileup struct. 
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + double frac = (double)na/nt; + if ( !indel_support_ok && na >= bca->min_support + && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) + bca->max_support = na, bca->max_frac = frac; + + n_alt += na; + n_tot += nt; + } + + // Sort aux[] and dedup + ks_introsort(uint32_t, m, aux); + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + + // Taking totals makes it hard to call rare indels (IMF filter) + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac + || n_alt < bca->min_support ) + ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); + return NULL; + } + + // Bail out if we have far too many types of indel + if (n_types >= MAX_TYPES) { + free(aux); + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) + fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. " + "Skip the position.\n", __func__, pos + 1); + return NULL; + } + + // To prevent long stretches of N's to be mistaken for indels + // (sometimes thousands of bases), check the number of N's in the + // sequence and skip places where half or more reference bases are Ns. + int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len + ?2*INDEL_WINDOW_SIZE : max_rd_len); + for (i=pos; i(i-pos) ) { + free(aux); + return NULL; + } + + // Finally fill out the types[] array detailing the size of insertion + // or deletion. + types = (int*)calloc(n_types, sizeof(int)); + if (!types) { + free(aux); + return NULL; + } + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + + // Find reference type; types[?] 
== 0) + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + + *ref_type_r = t; + *n_types_r = n_types; + *max_rd_len_r = max_rd_len; + *N_r = N; + + return types; +} + +// Part of bcf_call_gap_prep. +// +// Construct per-sample consensus. +// +// Returns an array of consensus seqs, +// or NULL on failure. +static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int left, int right) { + int i, k, s, L = right - left + 1, max_i, max2_i; + char **ref_sample; // returned + uint32_t *cns = NULL, max, max2; + char *ref0 = NULL, *r; + ref_sample = (char**) calloc(n, sizeof(char*)); + cns = (uint32_t*) calloc(L, 4); + ref0 = (char*) calloc(L, 1); + if (!ref_sample || !cns || !ref0) { + n = 0; + goto err; + } + + // Convert ref ASCII to 0-15. + for (i = 0; i < right - left; ++i) + ref0[i] = seq_nt16_table[(int)ref[i+left]]; + + // NB: one consensus per sample 'n', not per indel type. + // FIXME: consider fixing this. We should compute alignments vs + // types, not vs samples? Or types/sample combined? + for (s = 0; s < n; ++s) { + r = ref_sample[s] = (char*) calloc(L, 1); + if (!r) { + n = s-1; + goto err; + } + + memset(cns, 0, sizeof(int) * L); + + // collect ref and non-ref counts in cns + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int x = b->core.pos, y = 0; + + // TODO: pileup exposes pileup_ind, but we also need e.g. + // pileup_len to know how much of the current CIGAR op-len + // we've used (or have remaining). If we had that, we + // could start at p->qpos without having to scan through + // the entire CIGAR string until we find it. + // + // Without it about all we could do is have a side channel + // to cache the last known coords. Messy, so punt for now. + // This is no longer the bottle neck until we get to 1000s of + // CIGAR ops. 
+ + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (x + l >= left) { + j = left - x > 0 ? left - x : 0; + int j_end = right - x < l ? right - x : l; + for (; j < j_end; j++) + // Append to cns. Note this is ref coords, + // so insertions aren't in cns and deletions + // will have lower coverage. + + // FIXME: want true consensus (with ins) per + // type, so we can independently compare each + // seq to each consensus and see which it + // matches best, so we get proper GT analysis. + cns[x+j-left] += + (bam_seqi(seq, y+j) == ref0[x+j-left]) + ? 1 // REF + : (1<<16); // ALT + } + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + x += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + } + + if (x > right) + break; + } + } + + // Determine a sample specific reference. + for (i = 0; i < right - left; ++i) + r[i] = ref0[i]; + + // Find deepest and 2nd deepest ALT region (max & max2). + max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) + max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) + max2 = cns[i], max2_i = i; + } + + // Masks mismatches present in at least 70% of the reads with 'N'. + // This code is nREF/(nREF+n_ALT) >= 70% for deepest region. + // The effect is that at least 30% of bases differing to REF will + // use "N" in consensus, so we don't penalise ALT or REF when + // aligning against it. (A poor man IUPAC code) + // + // Why is it only done in two loci at most? 
+ if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) + max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) + max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; + + //for (i = 0; i < right - left; ++i) + // fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr); + //fputc('\n', bcftools_stderr); + } + + free(ref0); + free(cns); + + return ref_sample; + + err: + free(ref0); + free(cns); + if (ref_sample) { + for (s = 0; s < n; s++) + free(ref_sample[s]); + free(ref_sample); + } + + return NULL; +} + +// The length of the homopolymer run around the current position +static int bcf_cgp_l_run(const char *ref, int pos) { + int i, l_run; + + int c = seq_nt16_table[(int)ref[pos + 1]]; + if (c == 15) { + l_run = 1; + } else { + for (i = pos + 2; ref[i]; ++i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (seq_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + + return l_run; +} + + +// Compute the consensus for this sample 's', minus indels which +// get added later. +static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { + int i, j, t, k; + int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int)); + if (!inscns_aux) + return NULL; + + // Count the number of occurrences of each base at each position for + // each type of insertion. 
+ for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam_get_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; + } + } + } + } + } + } + + // Use the majority rule to construct the consensus + char *inscns = (char *)calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) { + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max ? max_k : 4; + if (max_k == 4) { + // discard insertions which contain N's + types[t] = 0; + break; + } + } + } + free(inscns_aux); + + return inscns; +} + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +// Part of bcf_call_gap_prep. +// +// Realign using BAQ to get an alignment score of a single read vs +// a haplotype consensus. +// +// Fills out score +// Returns 0 on success, +// <0 on error +static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, + int type, uint8_t *ref2, uint8_t *query, + int r_start, int r_end, int long_read, + int tbeg, int tend, + int left, int right, + int qbeg, int qend, + int qpos, int max_deletion, + int *score) { + // Illumina + probaln_par_t apf = { 1e-4, 1e-2, 10 }; + + // Parameters that work better on PacBio CCS 15k. + // We should consider querying the header and RG PU field. + // See also htslib/realn.c:sam_prob_realn() + if (long_read) { + apf.d = 1e-3; + apf.e = 1e-1; + } + + type = abs(type); + apf.bw = type + 3; + int l, sc; + const uint8_t *qual = bam_get_qual(p->b), *bq; + uint8_t *qq; + + // Get segment of quality, either ZQ tag or if absent QUAL. 
+ if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) + return -1; + bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + if (bq) ++bq; // skip type + for (l = qbeg; l < qend; ++l) { + int qval = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qval > 30) + qval = 30; + if (qval < 7) + qval = 7; + qq[l - qbeg] = qval; + } + + // The bottom 8 bits are length-normalised score while + // the top bits are unnormalised. + sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, + query, qend - qbeg, qq, &apf, 0, 0); + if (sc < 0) { + *score = 0xffffff; + free(qq); + return 0; + } + + // used for adjusting indelQ below + l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; + *score = sc<<8 | MIN(255, l); + + rep_ele *reps, *elt, *tmp; + uint8_t *seg = ref2 + tbeg - left; + int seg_len = tend - tbeg + type; + + // Note: although seg moves (tbeg varies), ref2 is reused many times + // so we could factor out some find_STR calls. However it's not the + // bottleneck for now. + + // FIXME: need to make this work on IUPAC. + reps = find_STR((char *)seg, seg_len, 0); + int iscore = 0; + + // Identify STRs in ref covering the indel up to + // (or close to) the end of the sequence. + // Those having an indel and right at the sequence + // end do not confirm the total length of indel + // size. Specifically a *lack* of indel at the + // end, where we know indels occur in other + // sequences, is a possible reference bias. + // + // This is emphasised further if the sequence ends with + // soft clipping. + DL_FOREACH_SAFE(reps, elt, tmp) { + if (elt->start <= qpos && elt->end >= qpos) { + iscore += (elt->end-elt->start) / elt->rep_len; // c + if (elt->start+tbeg <= r_start || + elt->end+tbeg >= r_end) + iscore += 2*(elt->end-elt->start); + } + + DL_DELETE(reps, elt); + free(elt); + } + + // Apply STR score to existing indelQ + l = (*score&0xff)*.8 + iscore*2; + *score = (*score & ~0xff) | MIN(255, l); + + free(qq); + + return 0; +} + +// Part of bcf_call_gap_prep. 
+// +// Returns n_alt on success +// -1 on failure +static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, + bcf_callaux_t *bca, char *inscns, + int l_run, int max_ins, + int ref_type, int *types, int n_types, + int *score) { + // FIXME: n_types has a maximum; no need to alloc - use a #define? + int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp; + memset(sumq, 0, n_types * sizeof(int)); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score[K*n_types], seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + indelQ = (sc[1]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + tmp = sc[0]>>6 & 0xff; + // reduce indelQ + indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + + // Doesn't really help accuracy, but permits -h to take + // affect still. + if (indelQ > seqQ) indelQ = seqQ; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total + sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; + // fprintf(bcftools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + if (bca->maxins && !bca->inscns) + return -1; + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + if (bca->maxins) + memcpy(&bca->inscns[t * bca->maxins], + &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; + //fprintf(bcftools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + } + } + + return n_alt; +} + +/* +FIXME: with high number of samples, do we handle IMF correctly? Is it +fraction of indels across entire data set, or just fraction for this +specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt. + */ + /* notes: - - n .. 
number of samples - - the routine sets bam_pileup1_t.aux of each read as follows: - - 6: unused - - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff + - n .. number of samples + - the routine sets bam_pileup1_t.aux of each read as follows: + - 6: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff */ -int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, + bcf_callaux_t *bca, const char *ref) { - int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; + if (ref == 0 || bca == 0) return -1; + + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins; + int *score, max_ref2; int N, K, l_run, ref_type, n_alt; char *inscns = 0, *ref2, *query, **ref_sample; - if (ref == 0 || bca == 0) return -1; // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -111,77 +710,29 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (plp[s][i].indel != 0) break; if (i < n_plp[s]) break; } - if (s == n) return -1; // there is no indel at this position. 
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // find out how many types of indels are present - bca->max_support = bca->max_frac = 0; - int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; - uint32_t *aux; - aux = (uint32_t*) calloc(N + 1, 4); - m = max_rd_len = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (s = 0; s < n; ++s) { - int na = 0, nt = 0; - for (i = 0; i < n_plp[s]; ++i) { - const bam_pileup1_t *p = plp[s] + i; - ++nt; - if (p->indel != 0) { - ++na; - aux[m++] = MINUS_CONST + p->indel; - } - j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - double frac = (double)na/nt; - if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) - indel_support_ok = 1; - if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; - n_alt += na; - n_tot += nt; - } - // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), - // check the number of N's in the sequence and skip places where half or more reference bases are Ns. - int nN=0; for (i=pos; i-pos(i-pos) ) { free(aux); return -1; } - - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - // Taking totals makes it hard to call rare indels - if ( !bca->per_sample_flt ) - indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; - if ( n_types == 1 || !indel_support_ok ) { // then skip - free(aux); return -1; - } - if (n_types >= 64) { - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) - fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. 
Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); - t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; - free(aux); - for (t = 0; t < n_types; ++t) - if (types[t] == 0) break; - ref_type = t; // the index of the reference type (0) - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - /* The following block fixes a long-existing flaw in the INDEL + if (s == n) + // there is no indel at this position. + return -1; + + // find out how many types of indels are present + types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, + &max_rd_len, &n_types, &ref_type, &N); + if (!types) + return -1; + + + // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + + + /* The following call fixes a long-existing flaw in the INDEL * calling model: the interference of nearby SNPs. However, it also * reduces the power because sometimes, substitutions caused by * indels are not distinguishable from true mutations. Multiple @@ -189,284 +740,211 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla * * Masks mismatches present in at least 70% of the reads with 'N'. 
*/ - { // construct per-sample consensus - int L = right - left + 1, max_i, max2_i; - uint32_t *cns, max, max2; - char *ref0, *r; - ref_sample = (char**) calloc(n, sizeof(char*)); - cns = (uint32_t*) calloc(L, 4); - ref0 = (char*) calloc(L, 1); - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; - for (s = 0; s < n; ++s) { - r = ref_sample[s] = (char*) calloc(L, 1); - memset(cns, 0, sizeof(int) * L); - // collect ref and non-ref counts - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - int x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) - if (x + j >= left && x + j < right) - cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // determine the consensus - for (i = 0; i < right - left; ++i) r[i] = ref0[i]; - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; - } - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr); fputc('\n', bcftools_stderr); - } - free(ref0); free(cns); - } - { // the length of the homopolymer run around the current position - int c = seq_nt16_table[(int)ref[pos + 1]]; - if (c == 15) l_run = 1; - else { - for (i = pos + 2; ref[i]; ++i) - if 
(seq_nt16_table[(int)ref[i]] != c) break; - l_run = i; - for (i = pos; i >= 0; --i) - if (seq_nt16_table[(int)ref[i]] != c) break; - l_run -= i + 1; - } - } - // construct the consensus sequence + ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right); + + // The length of the homopolymer run around the current position + l_run = bcf_cgp_l_run(ref, pos); + + // construct the consensus sequence (minus indels, which are added later) max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int)); - // count the number of occurrences of each base at each position for each type of insertion - for (t = 0; t < n_types; ++t) { - if (types[t] > 0) { - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - if (p->indel == types[t]) { - uint8_t *seq = bam_get_seq(p->b); - for (k = 1; k <= p->indel; ++k) { - int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)]; - assert(c<5); - ++inscns_aux[(t*max_ins+(k-1))*5 + c]; - } - } - } - } - } - } - // use the majority rule to construct the consensus - inscns = (char*) calloc(n_types * max_ins, 1); - for (t = 0; t < n_types; ++t) { - for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; - for (k = 0; k < 5; ++k) - if (ia[k] > max) - max = ia[k], max_k = k; - inscns[t*max_ins + j] = max? max_k : 4; - if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's - } - } - free(inscns_aux); + inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos, + types, n_types, max_ins, s); + if (!inscns) + return -1; } + // compute the likelihood given each type of indel for each read max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? 
max_ins : -types[0]); ref2 = (char*) calloc(max_ref2, 1); query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); - score1 = (int*) calloc(N * n_types, sizeof(int)); - score2 = (int*) calloc(N * n_types, sizeof(int)); + score = (int*) calloc(N * n_types, sizeof(int)); bca->indelreg = 0; + double nqual_over_60 = bca->nqual / 60.0; + for (t = 0; t < n_types; ++t) { int l, ir; - probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; - apf1.bw = apf2.bw = abs(types[t]) + 3; + // compute indelreg - if (types[t] == 0) ir = 0; - else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); - else ir = est_indelreg(pos, ref, -types[t], 0); - if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(bcftools_stderr, "%d, %d, %d\n", pos, types[t], ir); - // realignment + if (types[t] == 0) + ir = 0; + else if (types[t] > 0) + ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else + ir = est_indelreg(pos, ref, -types[t], 0); + + if (ir > bca->indelreg) + bca->indelreg = ir; + + // Identify max deletion length + int max_deletion = 0; + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + if (max_deletion < -p->indel) + max_deletion = -p->indel; + } + } + + // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { - // write ref2 + // Construct ref2 from ref_sample, inscns and indels. + // This is now the true sample consensus (possibly prepended + // and appended with reference if sample data doesn't span + // the full length). 
for (k = 0, j = left; j <= pos; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; + + if (types[t] <= 0) + j += -types[t]; + else + for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; + for (; k < max_ref2; ++k) + ref2[k] = 4; + + if (right > j) + right = j; + // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, tbeg, tend, sc, kk; + + // Some basic ref vs alt stats. + int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual; + imq *= nqual_over_60; + + int sc_len, slen, epos, sc_end; + + // Only need to gather stats on one type, as it's + // identical calculation for all the subsequent ones + // and we're sharing the same stats array + if (t == 0) { + // Gather stats for INFO field to aid filtering. + // mq and sc_len not very helpful for filtering, but could + // help in assigning a better QUAL value. + // + // Pos is slightly useful. + // Base qual can be useful, but need qual prior to BAQ? + // May need to cache orig quals in aux tag so we can fetch + // them even after mpileup step. 
+ get_pos(bca, p, &sc_len, &slen, &epos, &sc_end); + + assert(imq >= 0 && imq < bca->nqual); + assert(epos >= 0 && epos < bca->npos); + assert(sc_len >= 0 && sc_len < 100); + if (p->indel) { + bca->ialt_mq[imq]++; + bca->ialt_scl[sc_len]++; + bca->ialt_pos[epos]++; + } else { + bca->iref_mq[imq]++; + bca->iref_scl[sc_len]++; + bca->iref_pos[epos]++; + } + } + + int qbeg, qpos, qend, tbeg, tend, kk; uint8_t *seq = bam_get_seq(p->b); uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads - // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. + if (p->b->core.flag & BAM_FUNMAP) continue; + + // FIXME: the following loop should be better moved outside; + // nonetheless, realignment should be much slower anyway. for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; - if (kk < p->b->core.n_cigar) continue; - // FIXME: the following skips soft clips, but using them may be more sensitive. + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) + break; + if (kk < p->b->core.n_cigar) + continue; + // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend); + // FIXME: loops over CIGAR multiple times + int left2 = left, right2 = right; + if (p->b->core.l_qseq > 1000) { + // long read data needs less context. It also tends to + // have many more candidate indels to investigate so + // speed here matters more. 
+ if (pos - left >= INDEL_WINDOW_SIZE) + left2 += INDEL_WINDOW_SIZE/2; + if (right-pos >= INDEL_WINDOW_SIZE) + right2 -= INDEL_WINDOW_SIZE/2; + } + + int r_start = p->b->core.pos; + int r_end = bam_cigar2rlen(p->b->core.n_cigar, + bam_get_cigar(p->b)) + -1 + r_start; + + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2, + 0, &tbeg); + qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, + 0, &tend) - qbeg; + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2, + 1, &tend); + if (types[t] < 0) { int l = -types[t]; tbeg = tbeg - l > left? tbeg - l : left; } + // write the query sequence for (l = qbeg; l < qend; ++l) query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; - { // do realignment; this is the bottleneck - const uint8_t *qual = bam_get_qual(p->b), *bq; - uint8_t *qq; - qq = (uint8_t*) calloc(qend - qbeg, 1); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; // skip type - for (l = qbeg; l < qend; ++l) { - qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; - if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; - } - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below - if (l > 255) l = 255; - score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; - if (sc > 5) { - sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); - if (l > 255) l = 255; - score2[K*n_types + t] = sc<<8 | l; + + // A fudge for now. Consider checking SAM header for + // RG platform field. 
+ int long_read = p->b->core.l_qseq > 1000; + + // do realignment; this is the bottleneck + if (tend > tbeg) { + if (bcf_cgp_align_score(p, bca, types[t], + (uint8_t *)ref2 + left2-left, + (uint8_t *)query, + r_start, r_end, long_read, + tbeg, tend, left2, right2, + qbeg, qend, qpos, max_deletion, + &score[K*n_types + t]) < 0) { + score[K*n_types + t] = 0xffffff; + return -1; } - free(qq); + } else { + // place holder large cost for reads that cover the + // region entirely within a deletion (thus tend < tbeg). + score[K*n_types + t] = 0xffffff; } #if 0 for (l = 0; l < tend - tbeg + abs(types[t]); ++l) fputc("ACGTN"[(int)ref2[tbeg-left+l]], bcftools_stderr); fputc('\n', bcftools_stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], bcftools_stderr); + for (l = 0; l < qend - qbeg; ++l) + fputc("ACGTN"[(int)query[l]], bcftools_stderr); fputc('\n', bcftools_stderr); - fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc); + fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s " + "qbeg=%d tbeg=%d score=%d\n", + pos, types[t], s, i, bam_get_qname(p->b), + qbeg, tbeg, sc); #endif } } } - free(ref2); free(query); - { // compute indelQ - int sc_a[16], sumq_a[16]; - int tmp, *sc = sc_a, *sumq = sumq_a; - if (n_types > 16) { - sc = (int *)malloc(n_types * sizeof(int)); - sumq = (int *)malloc(n_types * sizeof(int)); - } - memset(sumq, 0, n_types * sizeof(int)); - for (s = K = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - /* errmod_cal() assumes that if the call is wrong, the - * likelihoods of other events are equal. 
This is about - * right for substitutions, but is not desired for - * indels. To reuse errmod_cal(), I have to make - * compromise for multi-allelic indels. - */ - if ((sc[0]&0x3f) == ref_type) { - indelQ1 = (sc[1]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ1 = (sc[t]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); - } - tmp = sc[0]>>6 & 0xff; - indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ - sct = &score2[K*n_types]; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - if ((sc[0]&0x3f) == ref_type) { - indelQ2 = (sc[1]>>14) - (sc[0]>>14); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ2 = (sc[t]>>14) - (sc[0]>>14); - } - tmp = sc[0]>>6 & 0xff; - indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); - // pick the smaller between indelQ1 and indelQ2 - indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - if (indelQ > 255) indelQ = 255; - if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; -// fprintf(bcftools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); - } - } - // determine bca->indel_types[] and bca->inscns - bca->maxins = max_ins; - bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); - for (t = 0; t < n_types; ++t) - sumq[t] = sumq[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) - tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sumq[t]&0x3f) == ref_type) break; - if (t) { // then move the reference type to the first - tmp = sumq[t]; - for (; t > 0; --t) sumq[t] = sumq[t-1]; - sumq[0] = tmp; - } - for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; - for (t = 0; t < 4 && t < n_types; ++t) { - bca->indel_types[t] = types[sumq[t]&0x3f]; - memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); - } - // update p->aux - for (s = n_alt = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int x = types[p->aux>>16&0x3f]; - for (j = 0; j < 4; ++j) - if (x == bca->indel_types[j]) break; - p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); - if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(bcftools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); - } - } - if (sc != sc_a) free(sc); - if (sumq != sumq_a) free(sumq); - } - free(score1); free(score2); + // compute indelQ + n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, + ref_type, types, n_types, score); + // free - for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref2); + free(query); + free(score); + + for (i = 0; i < n; ++i) + free(ref_sample[i]); + free(ref_sample); free(types); free(inscns); + return n_alt > 0? 
0 : -1; } diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index 96237ee..953cf6b 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -50,25 +50,40 @@ void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); +const char *hts_bcf_wmode2(int file_type, char *fname); +char *init_tmp_prefix(const char *prefix); void *smalloc(size_t size); // safe malloc -static inline char gt2iupac(char a, char b) +static inline int iupac2bitmask(char iupac) { - static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} }; - if ( a>='a' ) a -= 'a' - 'A'; - if ( b>='a' ) b -= 'a' - 'A'; - if ( a=='A' ) a = 0; - else if ( a=='C' ) a = 1; - else if ( a=='G' ) a = 2; - else if ( a=='T' ) a = 3; - else return 'N'; - if ( b=='A' ) b = 0; - else if ( b=='C' ) b = 1; - else if ( b=='G' ) b = 2; - else if ( b=='T' ) b = 3; - else return 'N'; - return iupac[(int)a][(int)b]; + const int A = 1; + const int C = 2; + const int G = 4; + const int T = 8; + if ( iupac >= 97 ) iupac -= 32; + if ( iupac == 'A' ) return A; + if ( iupac == 'C' ) return C; + if ( iupac == 'G' ) return G; + if ( iupac == 'T' ) return T; + if ( iupac == 'M' ) return A|C; + if ( iupac == 'R' ) return A|G; + if ( iupac == 'W' ) return A|T; + if ( iupac == 'S' ) return C|G; + if ( iupac == 'Y' ) return C|T; + if ( iupac == 'K' ) return G|T; + if ( iupac == 'V' ) return A|C|G; + if ( iupac == 'H' ) return A|C|T; + if ( iupac == 'D' ) return A|G|T; + if ( iupac == 'B' ) return C|G|T; + if ( iupac == 'N' ) return A|C|G|T; + return -1; +} +static inline char bitmask2iupac(int bitmask) +{ + const char iupac[16] = 
{'.','A','C','M','G','R','S','V','T','W','Y','H','K','D','B','N'}; + if ( bitmask <= 0 || bitmask > 15 ) return 0; + return iupac[bitmask]; } static inline int iupac_consistent(char iupac, char nt) @@ -101,4 +116,24 @@ static inline double phred_score(double prob) return prob>99 ? 99 : prob; } +static const uint64_t bcf_double_missing = 0x7ff0000000000001; +static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; +static inline void bcf_double_set(double *ptr, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.i = value; + *ptr = u.d; +} +static inline int bcf_double_test(double d, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.d = d; + return u.i==value ? 1 : 0; +} +#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) +#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) +#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) +#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) +#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) + #endif diff --git a/bcftools/bcftools.pysam.c b/bcftools/bcftools.pysam.c index de8739d..c6f4fd8 100644 --- a/bcftools/bcftools.pysam.c +++ b/bcftools/bcftools.pysam.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,25 @@ int bcftools_puts(const char *s) return putc('\n', bcftools_stdout); } + +static jmp_buf bcftools_jmpbuf; +static int bcftools_status = 0; + +int bcftools_dispatch(int argc, char *argv[]) +{ + if (setjmp(bcftools_jmpbuf) == 0) + return bcftools_main(argc, argv); + else + return bcftools_status; +} + +void bcftools_exit(int status) +{ + bcftools_status = status; + longjmp(bcftools_jmpbuf, 1); +} + + void bcftools_set_optind(int val) { // setting this in cython via diff --git a/bcftools/bcftools.pysam.h b/bcftools/bcftools.pysam.h index 453567a..b8bf93e 100644 --- 
a/bcftools/bcftools.pysam.h +++ b/bcftools/bcftools.pysam.h @@ -3,6 +3,17 @@ #include +#ifndef __has_attribute +#define __has_attribute(attribute) 0 +#endif +#ifndef PYSAM_NORETURN +#if __has_attribute(__noreturn__) || __GNUC__ >= 3 +#define PYSAM_NORETURN __attribute__((__noreturn__)) +#else +#define PYSAM_NORETURN +#endif +#endif + extern FILE * bcftools_stderr; extern FILE * bcftools_stdout; @@ -40,6 +51,8 @@ int bcftools_puts(const char *s); int bcftools_dispatch(int argc, char *argv[]); +void PYSAM_NORETURN bcftools_exit(int status); + void bcftools_set_optind(int); extern int bcftools_main(int argc, char *argv[]); diff --git a/bcftools/bin.c b/bcftools/bin.c index 95a2be1..a4817cf 100644 --- a/bcftools/bin.c +++ b/bcftools/bin.c @@ -25,6 +25,7 @@ */ #include +#include #include "bcftools.h" #include "bin.h" diff --git a/bcftools/bin.c.pysam.c b/bcftools/bin.c.pysam.c index 426ef45..1a177be 100644 --- a/bcftools/bin.c.pysam.c +++ b/bcftools/bin.c.pysam.c @@ -27,6 +27,7 @@ */ #include +#include #include "bcftools.h" #include "bin.h" diff --git a/bcftools/call.h b/bcftools/call.h index 50e4815..16bf0b6 100644 --- a/bcftools/call.h +++ b/bcftools/call.h @@ -1,6 +1,6 @@ /* call.h -- variant calling declarations. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2015, 2019-2020 Genome Research Ltd. Author: Petr Danecek @@ -34,7 +34,7 @@ THE SOFTWARE. 
*/ #define CALL_CONSTR_TRIO (1<<2) #define CALL_CONSTR_ALLELES (1<<3) // -// +#define CALL_FMT_PV4 (1<<5) #define CALL_FMT_GQ (1<<6) #define CALL_FMT_GP (1<<7) @@ -52,18 +52,13 @@ family_t; // For the single-sample and grouped -G calling typedef struct { + double ref_lk, max_lk, lk_sum; float *qsum; // QS(quality sum) values - int nqsum, dp; - double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc; -} -grp1_t; -typedef struct -{ - grp1_t *grp; - int ngrp; - int *smpl2grp; + int nqsum; + uint32_t *smpl, nsmpl; + uint32_t nals, als; } -grp_t; +smpl_grp_t; // For the `-C alleles -i` constrained calling typedef struct @@ -82,6 +77,7 @@ typedef struct int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old) char **als; // array to hold the trimmed set of alleles to appear on output int nals; // size of the als array + int als_new, nals_new; // bitmask with final alleles and their number family_t *fams; // list of families and samples for trio calling int nfams, mfams; int ntrio[5][5]; // possible trio genotype combinations and their counts; first idx: @@ -96,18 +92,16 @@ typedef struct int32_t *ugts, *cgts; // unconstraind and constrained GTs uint32_t output_tags; char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN) - tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES - char *sample_groups; // for single-sample or grouped calling with -G - grp_t smpl_grp; - float *qsum; - int nqsum; + tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES + char *sample_groups; // for single-sample or grouped calling with -G + char *sample_groups_tag; // for -G [AD|QS:] + smpl_grp_t *smpl_grp; + int nsmpl_grp; // ccall only double indel_frac, min_perm_p, min_lrt; double prior_type, pref; - double ref_lk, lk_sum; int ngrp1_samples, n_perm; - int nhets, ndiploid; char *prior_file; ccall_t *cdat; @@ -149,7 +143,7 @@ void qcall_destroy(call_t *call); void call_init_pl2p(call_t *call); uint32_t *call_trio_prep(int is_x, int is_son); -void init_allele_trimming_maps(call_t *call, int als, int 
nals); -void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als); +void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out); +void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new); #endif diff --git a/bcftools/ccall.c b/bcftools/ccall.c index 9f6958a..6bf987b 100644 --- a/bcftools/ccall.c +++ b/bcftools/ccall.c @@ -24,6 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include #include "call.h" #include "kmin.h" @@ -302,8 +303,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double // trim Number=R tags int out_als = 0; for (i=0; i +#include #include #include "call.h" #include "kmin.h" @@ -304,8 +305,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double // trim Number=R tags int out_als = 0; for (i=0; i @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,9 @@ #define PICK_SHORT 8 #define PICK_IUPAC 16 +#define TO_UPPER 0 +#define TO_LOWER 1 + typedef struct { int num; // number of ungapped blocks in this chain @@ -64,6 +68,16 @@ typedef struct } chain_t; +#define MASK_LC 1 +#define MASK_UC 2 +#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 
1 : 0) +typedef struct +{ + char *fname, with; + regidx_t *idx; + regitr_t *itr; +} +mask_t; typedef struct { @@ -71,9 +85,10 @@ typedef struct int fa_ori_pos; // start position of the fa_buffer (wrt original sequence) int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins) int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive) + int fa_frz_mod; // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence int fa_end_pos; // region's end position in the original sequence int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header) - int fa_case; // output upper case or lower case? + int fa_case; // output upper case or lower case: TO_UPPER|TO_LOWER int fa_src_pos; // last genomic coordinate read from the input fasta (0-based) char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778 int prev_base_pos; // the position of prev_base @@ -84,8 +99,8 @@ typedef struct int nvcf_buf, rid; char *chr, *chr_prefix; - regidx_t *mask; - regitr_t *itr; + mask_t *mask; + int nmask; int chain_id; // chain_id, to provide a unique ID to each chain in the chain output chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences @@ -101,7 +116,10 @@ typedef struct FILE *fp_chain; char **argv; int argc, output_iupac, haplotype, allele, isample, napplied; - char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele; + uint8_t *iupac_bitmask; + int miupac_bitmask; + char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; + char mark_del, mark_ins, mark_snv; } args_t; @@ -182,7 +200,7 @@ static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_s // fprintf(stderr, 
"push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); int num = chain->num; - if (ref_start <= chain->ref_last_block_ori) { + if (num && ref_start <= chain->ref_last_block_ori) { // In case this variant is back-to-back with the previous one chain->ref_last_block_ori = ref_start + ref_len; chain->alt_last_block_ori = alt_start + alt_len; @@ -222,11 +240,13 @@ static void init_data(args_t *args) if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); args->isample = 0; } - if ( args->mask_fname ) + int i; + for (i=0; inmask; i++) { - args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL); - if ( !args->mask ) error("Failed to initialize mask regions\n"); - args->itr = regitr_init(args->mask); + mask_t *mask = &args->mask[i]; + mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL); + if ( !mask->idx ) error("Failed to initialize mask regions\n"); + mask->itr = regitr_init(mask->idx); } // In case we want to store the chains if ( args->chain_fname ) @@ -245,10 +265,28 @@ static void init_data(args_t *args) if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); + args->rid = -1; +} +static void add_mask(args_t *args, char *fname) +{ + args->nmask++; + args->mask = (mask_t*)realloc(args->mask,args->nmask*sizeof(*args->mask)); + mask_t *mask = &args->mask[args->nmask-1]; + mask->fname = fname; + mask->with = 'N'; +} +static void add_mask_with(args_t *args, char *with) +{ + if ( !args->nmask ) error("The --mask-with option must follow --mask\n"); + mask_t *mask = &args->mask[args->nmask-1]; + if ( !strcasecmp(with,"uc") ) mask->with = MASK_UC; + else if ( !strcasecmp(with,"lc") ) mask->with = MASK_LC; + else if ( strlen(with)!=1 ) error("Expected \"lc\", \"uc\", or a single character with the --mask-with 
option\n"); + else mask->with = *with; } - static void destroy_data(args_t *args) { + free(args->iupac_bitmask); if (args->filter) filter_destroy(args->filter); bcf_sr_destroy(args->files); int i; @@ -257,8 +295,13 @@ static void destroy_data(args_t *args) free(args->vcf_buf); free(args->fa_buf.s); free(args->chr); - if ( args->mask ) regidx_destroy(args->mask); - if ( args->itr ) regitr_destroy(args->itr); + for (i=0; inmask; i++) + { + mask_t *mask = &args->mask[i]; + regidx_destroy(mask->idx); + regitr_destroy(mask->itr); + } + free(args->mask); if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname); @@ -297,6 +340,7 @@ static void init_region(args_t *args, char *line) args->fa_src_pos = from; args->fa_mod_off = 0; args->fa_frz_pos = -1; + args->fa_frz_mod = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; bcf_sr_seek(args->files,line,args->fa_ori_pos); @@ -345,7 +389,6 @@ static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr) static void flush_fa_buffer(args_t *args, int len) { if ( !args->fa_buf.l ) return; - int nwr = 0; while ( nwr + 60 <= args->fa_buf.l ) { @@ -356,6 +399,8 @@ static void flush_fa_buffer(args_t *args, int len) if ( nwr ) args->fa_ori_pos += nwr; + args->fa_frz_mod -= nwr; + if ( len ) { // not finished on this chr yet and the buffer cannot be emptied completely @@ -375,21 +420,84 @@ static void flush_fa_buffer(args_t *args, int len) args->fa_mod_off = 0; args->fa_buf.l = 0; } +static void apply_absent(args_t *args, hts_pos_t pos) +{ + if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return; + + int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l; + int ib = args->fa_frz_mod < 0 ? 
0 : args->fa_frz_mod; + int i; + for (i=ib; ifa_buf.s[i] = args->absent_allele; +} +static void freeze_ref(args_t *args, bcf1_t *rec) +{ + if ( args->fa_frz_pos >= rec->pos + rec->rlen - 1 ) return; + args->fa_frz_pos = rec->pos + rec->rlen - 1; + args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen; +} +static char *mark_del(char *ref, int rlen, char *alt, int mark) +{ + char *out = malloc(rlen+1); + int i; + if ( alt ) + { + int nalt = strlen(alt); + for (i=0; i + { + int nref = strlen(ref); + for (i=0; in_allele==1 && !args->missing_allele ) return; + if ( args->absent_allele ) apply_absent(args, rec->pos); + if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; } + int i,j; if ( args->mask ) { char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); int start = rec->pos; int end = rec->pos + rec->rlen - 1; - if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return; + for (i=0; inmask; i++) + { + mask_t *mask = &args->mask[i]; + if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return; + } } - int i, ialt = 1; // the alternate allele + int ialt = 1; // the alternate allele if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); @@ -403,6 +511,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) enum { use_hap, use_iupac, pick_one } action = use_hap; if ( args->allele==PICK_IUPAC ) { + if ( !args->haplotype ) action = use_iupac; if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; } else if ( args->output_iupac ) action = use_iupac; @@ -441,41 +550,40 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else if ( action==use_iupac ) { - ialt = ptr[0]; - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) + ialt = -1; + int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; + for (i=0; in; i++) { - if ( !args->missing_allele ) return; - ialt = -1; - } - else - ialt = bcf_gt_allele(ialt); + if ( bcf_gt_is_missing(ptr[i]) ) { 
is_missing = 1; continue; } + if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; + int jalt = bcf_gt_allele(ptr[i]); + if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( fallback_alt <= 0 ) fallback_alt = jalt; - int jalt; - if ( fmt->n>1 ) - { - jalt = ptr[1]; - if ( bcf_gt_is_missing(jalt) ) + int l = strlen(rec->d.allele[jalt]); + for (j=0; jd.allele[jalt][j]) < 0 ) break; + if ( j mlen ) { - if ( !args->missing_allele ) return; - ialt = -1; + hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); + for (j=mlen; jiupac_bitmask[j] = 0; + mlen = l; } - else if ( jalt==bcf_int32_vector_end ) jalt = ialt; - else - jalt = bcf_gt_allele(jalt); - } - else jalt = ialt; - - if ( ialt>=0 ) - { - if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? 
+ if ( jalt>0 && l>alen ) { - char ial = rec->d.allele[ialt][0]; - char jal = rec->d.allele[jalt][0]; - if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored - rec->d.allele[ialt][0] = gt2iupac(ial,jal); + alen = l; + ialt = jalt; } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]); } + if ( alen > 0 ) + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + else if ( fallback_alt >= 0 ) + ialt = fallback_alt; + else if ( is_missing && !args->missing_allele ) return; } else { @@ -520,17 +628,50 @@ static void apply_variant(args_t *args, bcf1_t *rec) } } } - if ( !ialt ) return; // ref allele + if ( !ialt ) + { + // ref allele + if ( args->absent_allele ) freeze_ref(args,rec); + return; + } if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } - else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) + else if ( args->output_iupac && rec->n_allele>1 ) { - char ial = rec->d.allele[0][0]; - char jal = rec->d.allele[1][0]; - rec->d.allele[1][0] = gt2iupac(ial,jal); + int ialt, alen = 0, mlen = 0; + for (i=0; in_allele; i++) + { + int l = strlen(rec->d.allele[i]); + for (j=0; jd.allele[i][j]) < 0 ) break; + if ( j mlen ) + { + hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); + for (j=mlen; jiupac_bitmask[j] = 0; + mlen = l; + } + if ( i>0 && l>alen ) + { + alen = l; + ialt = i; + } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]); + } + if ( alen > 0 ) + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + else + ialt = 1; } - if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference + if ( rec->n_allele==1 && ialt!=-1 ) + { + // non-missing reference + if ( args->absent_allele ) freeze_ref(args,rec); + return; + } if ( ialt==-1 ) { char alleles[4]; @@ -542,15 +683,34 @@ static void apply_variant(args_t *args, bcf1_t *rec) ialt = 1; } + // For 
some variant types POS+REF refer to the base *before* the event; in such case set trim_beg + int trim_beg = 0; + int var_type = bcf_get_variant_type(rec,ialt); + int var_len = rec->d.var[ialt].n; + if ( var_type & VCF_INDEL ) + { + // normally indel starts one base after, but not if the first base of the fa reference is deleted + if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] ) + trim_beg = 1; + else + trim_beg = 0; + } + else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"") ) + { + trim_beg = 1; + var_len = 1 - rec->rlen; + } + else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"pos <= args->fa_frz_pos ) { // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). // This still may not be enough for more complicated cases with multiple duplicate positions // and other types in between. In such case let the user normalize the VCF and remove duplicates. + int overlap = 0; - if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1; - else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1; + if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1; if ( overlap ) { @@ -560,6 +720,9 @@ static void apply_variant(args_t *args, bcf1_t *rec) } + char *alt_allele = rec->d.allele[ialt]; + int rmme_alt = 0; + int len_diff = 0, alen = 0; int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; if ( idx<0 ) @@ -570,10 +733,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( rec->rlen > args->fa_buf.l - idx ) { rec->rlen = args->fa_buf.l - idx; - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); if ( alen > rec->rlen ) { - rec->d.allele[ialt][rec->rlen] = 0; + alt_allele[rec->rlen] = 0; fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } } @@ -581,14 +744,44 @@ static void apply_variant(args_t *args, 
bcf1_t *rec) error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); // sanity check the reference base - if ( rec->d.allele[ialt][0]=='<' ) + if ( alt_allele[0]=='<' ) { - if ( strcasecmp(rec->d.allele[ialt], "") ) - error("Symbolic alleles other than are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 - len_diff = 1-rec->rlen; - rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event - alen = strlen(rec->d.allele[ialt]); + // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG + + if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") ) + error("Symbolic alleles other than , <*> or are currently not supported, e.g. %s at %s:%"PRId64".\n" + "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", + alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( !strcasecmp(alt_allele,"") ) + { + static int multibase_ref_del_warned = 0; + if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned ) + { + fprintf(stderr, + "Warning: one REF base is expected with , assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n" + " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + multibase_ref_del_warned = 1; + } + if ( args->mark_del ) // insert dashes instead of delete sequence + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } + else + { + len_diff = 1-rec->rlen; + alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event + alen = 1; + } + } + else + { + // <*> or .. 
gVCF, evidence for the reference allele throughout the whole block + freeze_ref(args,rec); + return; + } } else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) { @@ -614,39 +807,63 @@ static void apply_variant(args_t *args, bcf1_t *rec) } error( "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" - " .vcf: [%s] <- (REF)\n" - " .vcf: [%s] <- (ALT)\n" - " .fa: [%s]%c%s\n", - bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, + " REF .vcf: [%s]\n" + " ALT .vcf: [%s]\n" + " REF .fa : [%s]%c%s\n", + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx, tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ); } - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); len_diff = alen - rec->rlen; + + if ( args->mark_del && len_diff<0 ) + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } } else { - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); len_diff = alen - rec->rlen; + + if ( args->mark_del && len_diff<0 ) + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } } - if ( args->fa_case ) - for (i=0; id.allele[ialt][i] = toupper(rec->d.allele[ialt][i]); + args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? 
TO_UPPER : TO_LOWER; + if ( args->fa_case==TO_UPPER ) + for (i=0; id.allele[ialt][i] = tolower(rec->d.allele[ialt][i]); + for (i=0; imark_ins && len_diff>0 ) + mark_ins(rec->d.allele[0], alt_allele, args->mark_ins); + if ( args->mark_snv ) + mark_snv(rec->d.allele[0], alt_allele, args->mark_snv); if ( len_diff <= 0 ) { // deletion or same size event - for (i=0; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; + assert( args->fa_buf.l >= idx+rec->rlen ); + args->prev_base = args->fa_buf.s[idx+rec->rlen-1]; + args->prev_base_pos = rec->pos + rec->rlen - 1; + args->prev_is_insert = 0; + args->fa_frz_mod = idx + alen; + + for (i=trim_beg; ifa_buf.s[idx+i] = alt_allele[i]; if ( len_diff ) memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); - - args->prev_base = rec->d.allele[0][rec->rlen - 1]; - args->prev_base_pos = rec->pos + rec->rlen - 1; - args->prev_is_insert = 0; } else { @@ -663,14 +880,16 @@ static void apply_variant(args_t *args, bcf1_t *rec) // 1 C T // 1 C CAA int ibeg = 0; - while ( ibegd.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; + while ( ibegd.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; + args->fa_buf.s[idx+i] = alt_allele[i]; + + args->fa_frz_mod = idx + alen - ibeg + 1; } if (args->chain && len_diff != 0) { // If first nucleotide of both REF and ALT are the same... 
(indels typically include the nucleotide before the variant) - if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0) + if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0) { // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1); @@ -685,6 +904,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) args->fa_mod_off += len_diff; args->fa_frz_pos = rec->pos + rec->rlen - 1; args->napplied++; + if ( rmme_alt ) free(alt_allele); } @@ -692,17 +912,27 @@ static void mask_region(args_t *args, char *seq, int len) { int start = args->fa_src_pos - len; int end = args->fa_src_pos; + int i; - if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return; - - int idx_start, idx_end, i; - while ( regitr_overlap(args->itr) ) + for (i=0; inmask; i++) { - idx_start = args->itr->beg - start; - idx_end = args->itr->end - start; - if ( idx_start < 0 ) idx_start = 0; - if ( idx_end >= len ) idx_end = len - 1; - for (i=idx_start; i<=idx_end; i++) seq[i] = 'N'; + mask_t *mask = &args->mask[i]; + if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue; + + int idx_start, idx_end, j; + while ( regitr_overlap(mask->itr) ) + { + idx_start = mask->itr->beg - start; + idx_end = mask->itr->end - start; + if ( idx_start < 0 ) idx_start = 0; + if ( idx_end >= len ) idx_end = len - 1; + if ( mask->with==MASK_UC ) + for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]); + else if ( mask->with==MASK_LC ) + for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]); + else + for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with; + } } } @@ -720,13 +950,20 @@ static void consensus(args_t *args) print_chain(args); destroy_chain(args); } - // apply all cached variants - while ( args->vcf_rbuf.n ) + // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*) + bcf1_t 
**rec_ptr = NULL; + while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) { - bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f]; + bcf1_t *rec = *rec_ptr; if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break; - int i = rbuf_shift(&args->vcf_rbuf); - apply_variant(args, args->vcf_buf[i]); + apply_variant(args, rec); + } + if ( args->absent_allele ) + { + int pos = 0; + if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid ) + pos = args->vcf_buf[args->vcf_rbuf.f]->pos; + apply_absent(args, pos); } flush_fa_buffer(args, 0); init_region(args, str.s+1); @@ -771,7 +1008,11 @@ static void consensus(args_t *args) } apply_variant(args, rec); } - if ( !rec_ptr ) flush_fa_buffer(args, 60); + if ( !rec_ptr ) + { + if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l); + flush_fa_buffer(args, 60); + } } bcf1_t **rec_ptr = NULL; while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) @@ -787,6 +1028,7 @@ static void consensus(args_t *args) print_chain(args); destroy_chain(args); } + if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX); flush_fa_buffer(args, 0); bgzf_close(fasta); free(str.s); @@ -801,27 +1043,33 @@ static void usage(args_t *args) fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); fprintf(stderr, " (or haplotype) calls from FORMAT/GT. 
The program ignores allelic depth\n"); fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n"); - fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); + fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --chain write a chain file for liftover\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(stderr, " the codes are case-insensitive:\n"); - fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); - fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); - fprintf(stderr, " R: REF allele in het genotypes\n"); - fprintf(stderr, " A: ALT allele\n"); - fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); - fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(stderr, " -m, --mask replace regions with N\n"); - fprintf(stderr, " -M, --missing output instead of skipping the missing genotypes\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -p, --prefix prefix to add to output sequence names\n"); - fprintf(stderr, " -s, --sample apply variants of the given sample\n"); + fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n"); + fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); + fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man 
page for details)\n"); + fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); + fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(stderr, " the codes are case-insensitive:\n"); + fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); + fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); + fprintf(stderr, " R: REF allele in het genotypes\n"); + fprintf(stderr, " A: ALT allele\n"); + fprintf(stderr, " I: IUPAC code for all genotypes\n"); + fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); + fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); + fprintf(stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " -m, --mask FILE replace regions according to the next --mask-with option. 
The default is --mask-with N\n"); + fprintf(stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); + fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); + fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); + fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(stderr, " # in the form \">chr:from-to\".\n"); @@ -837,6 +1085,10 @@ int main_consensus(int argc, char *argv[]) static struct option loptions[] = { + {"mark-del",required_argument,NULL,1}, + {"mark-ins",required_argument,NULL,2}, + {"mark-snv",required_argument,NULL,3}, + {"mask-with",1,0,4}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, @@ -846,23 +1098,44 @@ int main_consensus(int argc, char *argv[]) {"fasta-ref",1,0,'f'}, {"mask",1,0,'m'}, {"missing",1,0,'M'}, + {"absent",1,0,'a'}, {"chain",1,0,'c'}, {"prefix",required_argument,0,'p'}, {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) { switch (c) { + case 1 : args->mark_del = optarg[0]; break; + case 2 : + if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u'; + else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l'; + else error("The argument is not recognised: --mark-ins %s\n",optarg); + break; + case 3 : + if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u'; + else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l'; + else error("The argument is not recognised: --mark-snv %s\n",optarg); + break; case 'p': args->chr_prefix = optarg; break; case 's': 
args->sample = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; - case 'm': args->mask_fname = optarg; break; + case 'm': add_mask(args,optarg); break; + case 4 : add_mask_with(args,optarg); break; + case 'a': + args->absent_allele = optarg[0]; + if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg); + break; case 'M': args->missing_allele = optarg[0]; if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg); @@ -877,6 +1150,7 @@ int main_consensus(int argc, char *argv[]) else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; + else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC; else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; else diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index b1b1861..5105a2e 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2017 Genome Research Ltd. + Copyright (c) 2014-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,9 @@ #define PICK_SHORT 8 #define PICK_IUPAC 16 +#define TO_UPPER 0 +#define TO_LOWER 1 + typedef struct { int num; // number of ungapped blocks in this chain @@ -66,6 +70,16 @@ typedef struct } chain_t; +#define MASK_LC 1 +#define MASK_UC 2 +#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0) +typedef struct +{ + char *fname, with; + regidx_t *idx; + regitr_t *itr; +} +mask_t; typedef struct { @@ -73,9 +87,10 @@ typedef struct int fa_ori_pos; // start position of the fa_buffer (wrt original sequence) int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins) int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive) + int fa_frz_mod; // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence int fa_end_pos; // region's end position in the original sequence int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header) - int fa_case; // output upper case or lower case? 
+ int fa_case; // output upper case or lower case: TO_UPPER|TO_LOWER int fa_src_pos; // last genomic coordinate read from the input fasta (0-based) char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778 int prev_base_pos; // the position of prev_base @@ -86,8 +101,8 @@ typedef struct int nvcf_buf, rid; char *chr, *chr_prefix; - regidx_t *mask; - regitr_t *itr; + mask_t *mask; + int nmask; int chain_id; // chain_id, to provide a unique ID to each chain in the chain output chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences @@ -103,7 +118,10 @@ typedef struct FILE *fp_chain; char **argv; int argc, output_iupac, haplotype, allele, isample, napplied; - char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele; + uint8_t *iupac_bitmask; + int miupac_bitmask; + char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; + char mark_del, mark_ins, mark_snv; } args_t; @@ -184,7 +202,7 @@ static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_s // fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); int num = chain->num; - if (ref_start <= chain->ref_last_block_ori) { + if (num && ref_start <= chain->ref_last_block_ori) { // In case this variant is back-to-back with the previous one chain->ref_last_block_ori = ref_start + ref_len; chain->alt_last_block_ori = alt_start + alt_len; @@ -224,11 +242,13 @@ static void init_data(args_t *args) if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); args->isample = 0; } - if ( args->mask_fname ) + int i; + for (i=0; inmask; i++) { - args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL); - if ( !args->mask ) 
error("Failed to initialize mask regions\n"); - args->itr = regitr_init(args->mask); + mask_t *mask = &args->mask[i]; + mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL); + if ( !mask->idx ) error("Failed to initialize mask regions\n"); + mask->itr = regitr_init(mask->idx); } // In case we want to store the chains if ( args->chain_fname ) @@ -247,10 +267,28 @@ static void init_data(args_t *args) if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); + args->rid = -1; +} +static void add_mask(args_t *args, char *fname) +{ + args->nmask++; + args->mask = (mask_t*)realloc(args->mask,args->nmask*sizeof(*args->mask)); + mask_t *mask = &args->mask[args->nmask-1]; + mask->fname = fname; + mask->with = 'N'; +} +static void add_mask_with(args_t *args, char *with) +{ + if ( !args->nmask ) error("The --mask-with option must follow --mask\n"); + mask_t *mask = &args->mask[args->nmask-1]; + if ( !strcasecmp(with,"uc") ) mask->with = MASK_UC; + else if ( !strcasecmp(with,"lc") ) mask->with = MASK_LC; + else if ( strlen(with)!=1 ) error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n"); + else mask->with = *with; } - static void destroy_data(args_t *args) { + free(args->iupac_bitmask); if (args->filter) filter_destroy(args->filter); bcf_sr_destroy(args->files); int i; @@ -259,8 +297,13 @@ static void destroy_data(args_t *args) free(args->vcf_buf); free(args->fa_buf.s); free(args->chr); - if ( args->mask ) regidx_destroy(args->mask); - if ( args->itr ) regitr_destroy(args->itr); + for (i=0; inmask; i++) + { + mask_t *mask = &args->mask[i]; + regidx_destroy(mask->idx); + regitr_destroy(mask->itr); + } + free(args->mask); if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( fclose(args->fp_out) ) error("Close failed: %s\n", 
args->output_fname); @@ -299,6 +342,7 @@ static void init_region(args_t *args, char *line) args->fa_src_pos = from; args->fa_mod_off = 0; args->fa_frz_pos = -1; + args->fa_frz_mod = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; bcf_sr_seek(args->files,line,args->fa_ori_pos); @@ -347,7 +391,6 @@ static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr) static void flush_fa_buffer(args_t *args, int len) { if ( !args->fa_buf.l ) return; - int nwr = 0; while ( nwr + 60 <= args->fa_buf.l ) { @@ -358,6 +401,8 @@ static void flush_fa_buffer(args_t *args, int len) if ( nwr ) args->fa_ori_pos += nwr; + args->fa_frz_mod -= nwr; + if ( len ) { // not finished on this chr yet and the buffer cannot be emptied completely @@ -377,21 +422,84 @@ static void flush_fa_buffer(args_t *args, int len) args->fa_mod_off = 0; args->fa_buf.l = 0; } +static void apply_absent(args_t *args, hts_pos_t pos) +{ + if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return; + + int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l; + int ib = args->fa_frz_mod < 0 ? 
0 : args->fa_frz_mod; + int i; + for (i=ib; ifa_buf.s[i] = args->absent_allele; +} +static void freeze_ref(args_t *args, bcf1_t *rec) +{ + if ( args->fa_frz_pos >= rec->pos + rec->rlen - 1 ) return; + args->fa_frz_pos = rec->pos + rec->rlen - 1; + args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen; +} +static char *mark_del(char *ref, int rlen, char *alt, int mark) +{ + char *out = malloc(rlen+1); + int i; + if ( alt ) + { + int nalt = strlen(alt); + for (i=0; i + { + int nref = strlen(ref); + for (i=0; in_allele==1 && !args->missing_allele ) return; + if ( args->absent_allele ) apply_absent(args, rec->pos); + if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; } + int i,j; if ( args->mask ) { char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); int start = rec->pos; int end = rec->pos + rec->rlen - 1; - if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return; + for (i=0; inmask; i++) + { + mask_t *mask = &args->mask[i]; + if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return; + } } - int i, ialt = 1; // the alternate allele + int ialt = 1; // the alternate allele if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); @@ -405,6 +513,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) enum { use_hap, use_iupac, pick_one } action = use_hap; if ( args->allele==PICK_IUPAC ) { + if ( !args->haplotype ) action = use_iupac; if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; } else if ( args->output_iupac ) action = use_iupac; @@ -443,41 +552,40 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else if ( action==use_iupac ) { - ialt = ptr[0]; - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) + ialt = -1; + int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; + for (i=0; in; i++) { - if ( !args->missing_allele ) return; - ialt = -1; - } - else - ialt = bcf_gt_allele(ialt); + if ( bcf_gt_is_missing(ptr[i]) ) { 
is_missing = 1; continue; } + if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; + int jalt = bcf_gt_allele(ptr[i]); + if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( fallback_alt <= 0 ) fallback_alt = jalt; - int jalt; - if ( fmt->n>1 ) - { - jalt = ptr[1]; - if ( bcf_gt_is_missing(jalt) ) + int l = strlen(rec->d.allele[jalt]); + for (j=0; jd.allele[jalt][j]) < 0 ) break; + if ( j mlen ) { - if ( !args->missing_allele ) return; - ialt = -1; + hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); + for (j=mlen; jiupac_bitmask[j] = 0; + mlen = l; } - else if ( jalt==bcf_int32_vector_end ) jalt = ialt; - else - jalt = bcf_gt_allele(jalt); - } - else jalt = ialt; - - if ( ialt>=0 ) - { - if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? 
+ if ( jalt>0 && l>alen ) { - char ial = rec->d.allele[ialt][0]; - char jal = rec->d.allele[jalt][0]; - if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored - rec->d.allele[ialt][0] = gt2iupac(ial,jal); + alen = l; + ialt = jalt; } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]); } + if ( alen > 0 ) + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + else if ( fallback_alt >= 0 ) + ialt = fallback_alt; + else if ( is_missing && !args->missing_allele ) return; } else { @@ -522,17 +630,50 @@ static void apply_variant(args_t *args, bcf1_t *rec) } } } - if ( !ialt ) return; // ref allele + if ( !ialt ) + { + // ref allele + if ( args->absent_allele ) freeze_ref(args,rec); + return; + } if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } - else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) + else if ( args->output_iupac && rec->n_allele>1 ) { - char ial = rec->d.allele[0][0]; - char jal = rec->d.allele[1][0]; - rec->d.allele[1][0] = gt2iupac(ial,jal); + int ialt, alen = 0, mlen = 0; + for (i=0; in_allele; i++) + { + int l = strlen(rec->d.allele[i]); + for (j=0; jd.allele[i][j]) < 0 ) break; + if ( j mlen ) + { + hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); + for (j=mlen; jiupac_bitmask[j] = 0; + mlen = l; + } + if ( i>0 && l>alen ) + { + alen = l; + ialt = i; + } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]); + } + if ( alen > 0 ) + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + else + ialt = 1; } - if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference + if ( rec->n_allele==1 && ialt!=-1 ) + { + // non-missing reference + if ( args->absent_allele ) freeze_ref(args,rec); + return; + } if ( ialt==-1 ) { char alleles[4]; @@ -544,15 +685,34 @@ static void apply_variant(args_t *args, bcf1_t *rec) ialt = 1; } + // For 
some variant types POS+REF refer to the base *before* the event; in such case set trim_beg + int trim_beg = 0; + int var_type = bcf_get_variant_type(rec,ialt); + int var_len = rec->d.var[ialt].n; + if ( var_type & VCF_INDEL ) + { + // normally indel starts one base after, but not if the first base of the fa reference is deleted + if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] ) + trim_beg = 1; + else + trim_beg = 0; + } + else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"") ) + { + trim_beg = 1; + var_len = 1 - rec->rlen; + } + else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"pos <= args->fa_frz_pos ) { // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). // This still may not be enough for more complicated cases with multiple duplicate positions // and other types in between. In such case let the user normalize the VCF and remove duplicates. + int overlap = 0; - if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1; - else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1; + if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1; if ( overlap ) { @@ -562,6 +722,9 @@ static void apply_variant(args_t *args, bcf1_t *rec) } + char *alt_allele = rec->d.allele[ialt]; + int rmme_alt = 0; + int len_diff = 0, alen = 0; int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; if ( idx<0 ) @@ -572,10 +735,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( rec->rlen > args->fa_buf.l - idx ) { rec->rlen = args->fa_buf.l - idx; - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); if ( alen > rec->rlen ) { - rec->d.allele[ialt][rec->rlen] = 0; + alt_allele[rec->rlen] = 0; fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } } @@ -583,14 +746,44 @@ static void apply_variant(args_t 
*args, bcf1_t *rec) error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); // sanity check the reference base - if ( rec->d.allele[ialt][0]=='<' ) + if ( alt_allele[0]=='<' ) { - if ( strcasecmp(rec->d.allele[ialt], "") ) - error("Symbolic alleles other than are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 - len_diff = 1-rec->rlen; - rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event - alen = strlen(rec->d.allele[ialt]); + // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG + + if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") ) + error("Symbolic alleles other than , <*> or are currently not supported, e.g. 
%s at %s:%"PRId64".\n" + "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", + alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( !strcasecmp(alt_allele,"") ) + { + static int multibase_ref_del_warned = 0; + if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned ) + { + fprintf(bcftools_stderr, + "Warning: one REF base is expected with , assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n" + " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + multibase_ref_del_warned = 1; + } + if ( args->mark_del ) // insert dashes instead of delete sequence + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } + else + { + len_diff = 1-rec->rlen; + alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event + alen = 1; + } + } + else + { + // <*> or .. gVCF, evidence for the reference allele throughout the whole block + freeze_ref(args,rec); + return; + } } else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) { @@ -616,39 +809,63 @@ static void apply_variant(args_t *args, bcf1_t *rec) } error( "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" - " .vcf: [%s] <- (REF)\n" - " .vcf: [%s] <- (ALT)\n" - " .fa: [%s]%c%s\n", - bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, + " REF .vcf: [%s]\n" + " ALT .vcf: [%s]\n" + " REF .fa : [%s]%c%s\n", + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx, tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ); } - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); len_diff = alen - rec->rlen; + + if ( args->mark_del && len_diff<0 ) + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alen = rec->rlen; + len_diff 
= 0; + rmme_alt = 1; + } } else { - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); len_diff = alen - rec->rlen; + + if ( args->mark_del && len_diff<0 ) + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } } - if ( args->fa_case ) - for (i=0; id.allele[ialt][i] = toupper(rec->d.allele[ialt][i]); + args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER; + if ( args->fa_case==TO_UPPER ) + for (i=0; id.allele[ialt][i] = tolower(rec->d.allele[ialt][i]); + for (i=0; imark_ins && len_diff>0 ) + mark_ins(rec->d.allele[0], alt_allele, args->mark_ins); + if ( args->mark_snv ) + mark_snv(rec->d.allele[0], alt_allele, args->mark_snv); if ( len_diff <= 0 ) { // deletion or same size event - for (i=0; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; + assert( args->fa_buf.l >= idx+rec->rlen ); + args->prev_base = args->fa_buf.s[idx+rec->rlen-1]; + args->prev_base_pos = rec->pos + rec->rlen - 1; + args->prev_is_insert = 0; + args->fa_frz_mod = idx + alen; + + for (i=trim_beg; ifa_buf.s[idx+i] = alt_allele[i]; if ( len_diff ) memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); - - args->prev_base = rec->d.allele[0][rec->rlen - 1]; - args->prev_base_pos = rec->pos + rec->rlen - 1; - args->prev_is_insert = 0; } else { @@ -665,14 +882,16 @@ static void apply_variant(args_t *args, bcf1_t *rec) // 1 C T // 1 C CAA int ibeg = 0; - while ( ibegd.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; + while ( ibegd.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; + args->fa_buf.s[idx+i] = alt_allele[i]; + + args->fa_frz_mod = idx + alen - ibeg + 1; } if (args->chain && len_diff != 0) { // If first nucleotide of both REF and ALT are the same... 
(indels typically include the nucleotide before the variant) - if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0) + if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0) { // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1); @@ -687,6 +906,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) args->fa_mod_off += len_diff; args->fa_frz_pos = rec->pos + rec->rlen - 1; args->napplied++; + if ( rmme_alt ) free(alt_allele); } @@ -694,17 +914,27 @@ static void mask_region(args_t *args, char *seq, int len) { int start = args->fa_src_pos - len; int end = args->fa_src_pos; + int i; - if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return; - - int idx_start, idx_end, i; - while ( regitr_overlap(args->itr) ) + for (i=0; inmask; i++) { - idx_start = args->itr->beg - start; - idx_end = args->itr->end - start; - if ( idx_start < 0 ) idx_start = 0; - if ( idx_end >= len ) idx_end = len - 1; - for (i=idx_start; i<=idx_end; i++) seq[i] = 'N'; + mask_t *mask = &args->mask[i]; + if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue; + + int idx_start, idx_end, j; + while ( regitr_overlap(mask->itr) ) + { + idx_start = mask->itr->beg - start; + idx_end = mask->itr->end - start; + if ( idx_start < 0 ) idx_start = 0; + if ( idx_end >= len ) idx_end = len - 1; + if ( mask->with==MASK_UC ) + for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]); + else if ( mask->with==MASK_LC ) + for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]); + else + for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with; + } } } @@ -722,13 +952,20 @@ static void consensus(args_t *args) print_chain(args); destroy_chain(args); } - // apply all cached variants - while ( args->vcf_rbuf.n ) + // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*) + bcf1_t 
**rec_ptr = NULL; + while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) { - bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f]; + bcf1_t *rec = *rec_ptr; if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break; - int i = rbuf_shift(&args->vcf_rbuf); - apply_variant(args, args->vcf_buf[i]); + apply_variant(args, rec); + } + if ( args->absent_allele ) + { + int pos = 0; + if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid ) + pos = args->vcf_buf[args->vcf_rbuf.f]->pos; + apply_absent(args, pos); } flush_fa_buffer(args, 0); init_region(args, str.s+1); @@ -773,7 +1010,11 @@ static void consensus(args_t *args) } apply_variant(args, rec); } - if ( !rec_ptr ) flush_fa_buffer(args, 60); + if ( !rec_ptr ) + { + if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l); + flush_fa_buffer(args, 60); + } } bcf1_t **rec_ptr = NULL; while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) @@ -789,6 +1030,7 @@ static void consensus(args_t *args) print_chain(args); destroy_chain(args); } + if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX); flush_fa_buffer(args, 0); bgzf_close(fasta); free(str.s); @@ -803,33 +1045,39 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. 
The program ignores allelic depth\n"); fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n"); - fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] \n"); + fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --chain write a chain file for liftover\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(bcftools_stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(bcftools_stderr, " the codes are case-insensitive:\n"); - fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n"); - fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n"); - fprintf(bcftools_stderr, " R: REF allele in het genotypes\n"); - fprintf(bcftools_stderr, " A: ALT allele\n"); - fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); - fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(bcftools_stderr, " -m, --mask replace regions with N\n"); - fprintf(bcftools_stderr, " -M, --missing output instead of skipping the missing genotypes\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -p, --prefix prefix to add to output sequence names\n"); - fprintf(bcftools_stderr, " -s, --sample apply variants of the given sample\n"); + fprintf(bcftools_stderr, " -c, --chain FILE 
write a chain file for liftover\n"); + fprintf(bcftools_stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); + fprintf(bcftools_stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(bcftools_stderr, " the codes are case-insensitive:\n"); + fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n"); + fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n"); + fprintf(bcftools_stderr, " R: REF allele in het genotypes\n"); + fprintf(bcftools_stderr, " A: ALT allele\n"); + fprintf(bcftools_stderr, " I: IUPAC code for all genotypes\n"); + fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); + fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(bcftools_stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); + fprintf(bcftools_stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(bcftools_stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(bcftools_stderr, " -m, --mask FILE replace regions according to the next --mask-with option. 
The default is --mask-with N\n"); + fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); + fprintf(bcftools_stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); + fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); + fprintf(bcftools_stderr, " -s, --sample NAME apply variants of the given sample\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n"); fprintf(bcftools_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_consensus(int argc, char *argv[]) @@ -839,6 +1087,10 @@ int main_consensus(int argc, char *argv[]) static struct option loptions[] = { + {"mark-del",required_argument,NULL,1}, + {"mark-ins",required_argument,NULL,2}, + {"mark-snv",required_argument,NULL,3}, + {"mask-with",1,0,4}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, @@ -848,23 +1100,44 @@ int main_consensus(int argc, char *argv[]) {"fasta-ref",1,0,'f'}, {"mask",1,0,'m'}, {"missing",1,0,'M'}, + {"absent",1,0,'a'}, {"chain",1,0,'c'}, {"prefix",required_argument,0,'p'}, {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) { switch (c) { + case 1 : args->mark_del = optarg[0]; break; + case 2 : + if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u'; + else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l'; + else error("The argument is not recognised: --mark-ins 
%s\n",optarg); + break; + case 3 : + if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u'; + else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l'; + else error("The argument is not recognised: --mark-snv %s\n",optarg); + break; case 'p': args->chr_prefix = optarg; break; case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; - case 'm': args->mask_fname = optarg; break; + case 'm': add_mask(args,optarg); break; + case 4 : add_mask_with(args,optarg); break; + case 'a': + args->absent_allele = optarg[0]; + if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg); + break; case 'M': args->missing_allele = optarg[0]; if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg); @@ -879,6 +1152,7 @@ int main_consensus(int argc, char *argv[]) else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; + else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC; else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; else diff --git a/bcftools/convert.c b/bcftools/convert.c 
index fbf98e0..71dfb51 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "variantkey.h" #include "convert.h" +#include "filter.h" #define T_CHROM 1 #define T_POS 2 @@ -73,6 +75,7 @@ THE SOFTWARE. */ #define T_RSX 30 // RSID HEX #define T_VKX 31 // VARIANTKEY HEX #define T_PBINOM 32 +#define T_NPASS 33 typedef struct _fmt_t { @@ -503,7 +506,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam type_t val = x[j]; \ if ( !val ) continue; \ for (i=0; istr[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ + if ( val & (mask<str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ } \ } \ if ( fmt->subscript<0 || fmt->subscript==2 ) \ @@ -513,7 +516,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam type_t val = x[j]; \ if ( !val ) continue; \ for (i=1; istr[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ + if ( val & (1<str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ } \ } \ } @@ -521,7 +524,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam { case BCF_BT_INT8: BRANCH(uint8_t, 8); break; case BCF_BT_INT16: BRANCH(uint16_t,16); break; - case BCF_BT_INT32: BRANCH(uint32_t,32); break; + case BCF_BT_INT32: BRANCH(uint32_t,30); break; // 2 bytes unused to account for the reserved BCF values default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break; } #undef BRANCH @@ -782,8 +785,8 @@ static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, in int j; for (j=0; j1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects 
the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); sum+=ptr[j]; } @@ -1122,6 +1125,21 @@ static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, ksprintf(str, "%016" PRIx64 "", vk); } +static void process_npass(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +{ + int i, nsmpl = 0; + filter_t *flt = (filter_t*) fmt->usr; + const uint8_t *smpl; + filter_test(flt,line,&smpl); + for (i=0; insamples; i++) + if ( smpl[i] ) nsmpl++; + kputd(nsmpl, str); +} +static void destroy_npass(void *usr) +{ + filter_destroy((filter_t*)usr); +} + static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { int i; @@ -1225,11 +1243,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } } - if ( fmt->type==T_PBINOM ) + else if ( fmt->type==T_PBINOM ) { fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); } + else if ( fmt->type==T_NPASS ) + { + filter_t *flt = filter_init(convert->header,key); + convert->max_unpack |= filter_max_unpack(flt); + fmt->usr = (void*) flt; + } } switch (fmt->type) @@ -1266,6 +1290,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) case T_RSX: fmt->handler = &process_rsid_hex; break; case T_VKX: fmt->handler = &process_variantkey_hex; break; case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; + case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break; default: error("TODO: handler for type %d\n", fmt->type); } if ( key && fmt->type==T_INFO ) @@ -1344,6 
+1369,8 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) register_tag(convert, T_PBINOM, str.s, is_gtf); q++; } + else if ( !strcmp(str.s,"N_PASS") ) + error("N_PASS() must be placed outside the square brackets\n"); else { fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); @@ -1380,7 +1407,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); - else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. (todo)\n"); + else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n"); else if ( !strcmp(str.s, "INFO") ) { if ( *q=='/' ) @@ -1398,6 +1425,22 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) } else if ( !strcmp(str.s, "FORMAT") ) register_tag(convert, T_FORMAT, NULL, 0); + else if ( !strcmp(str.s,"N_PASS") ) + { + if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); + p = ++q; + str.l = 0; + int nopen = 1; + while ( *q && nopen ) + { + if ( *q=='(' ) nopen++; + else if ( *q==')' ) nopen--; + q++; + } + if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str); + kputsn(p, q-p-1, &str); + register_tag(convert, T_NPASS, str.s, is_gtf); + } else { fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); @@ -1565,7 +1608,8 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) for (js=0; jsnsamples; js++) { // Skip samples when filtering was requested - if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; + int ks = convert->samples[js]; + if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) 
continue; // Here comes a hack designed for TBCSQ. When running on large files, // such as 1000GP, there are too many empty fields in the output and @@ -1574,7 +1618,6 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) // brackets here. This may be changed in future, time will show... size_t l_start = str->l; - int ks = convert->samples[js]; for (k=i; kfmt[k].type == T_MASK ) diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index 8f04911..e3c995f 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -2,7 +2,7 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -27,6 +27,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "variantkey.h" #include "convert.h" +#include "filter.h" #define T_CHROM 1 #define T_POS 2 @@ -75,6 +77,7 @@ THE SOFTWARE. 
*/ #define T_RSX 30 // RSID HEX #define T_VKX 31 // VARIANTKEY HEX #define T_PBINOM 32 +#define T_NPASS 33 typedef struct _fmt_t { @@ -270,7 +273,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break; case BCF_BT_CHAR: kputc(info->v1.i, str); break; - default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break; } } else if ( fmt->subscript >=0 ) @@ -292,7 +295,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break; - default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break; } #undef BRANCH } @@ -505,7 +508,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam type_t val = x[j]; \ if ( !val ) continue; \ for (i=0; istr[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ + if ( val & (mask<str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ } \ } \ if ( fmt->subscript<0 || fmt->subscript==2 ) \ @@ -515,7 +518,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam type_t val = x[j]; \ if ( !val ) continue; \ for (i=1; istr[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ + if ( val & (1<str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ } \ } \ } @@ -523,8 +526,8 @@ 
static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam { case BCF_BT_INT8: BRANCH(uint8_t, 8); break; case BCF_BT_INT16: BRANCH(uint16_t,16); break; - case BCF_BT_INT32: BRANCH(uint32_t,32); break; - default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break; + case BCF_BT_INT32: BRANCH(uint32_t,30); break; // 2 bytes unused to account for the reserved BCF values + default: error("Unexpected type: %d\n", fmt->fmt->type); bcftools_exit(1); break; } #undef BRANCH @@ -784,8 +787,8 @@ static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, in int j; for (j=0; j1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); sum+=ptr[j]; } @@ -1124,6 +1127,21 @@ static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, ksprintf(str, "%016" PRIx64 "", vk); } +static void process_npass(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +{ + int i, nsmpl = 0; + filter_t *flt = (filter_t*) fmt->usr; + const uint8_t *smpl; + filter_test(flt,line,&smpl); + for (i=0; insamples; i++) + if ( smpl[i] ) nsmpl++; + kputd(nsmpl, str); +} +static void destroy_npass(void *usr) +{ + filter_destroy((filter_t*)usr); +} + static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { int i; @@ -1227,11 +1245,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } } - if ( fmt->type==T_PBINOM ) + else if ( fmt->type==T_PBINOM ) { fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the 
header: %s\n", fmt->key); } + else if ( fmt->type==T_NPASS ) + { + filter_t *flt = filter_init(convert->header,key); + convert->max_unpack |= filter_max_unpack(flt); + fmt->usr = (void*) flt; + } } switch (fmt->type) @@ -1268,6 +1292,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) case T_RSX: fmt->handler = &process_rsid_hex; break; case T_VKX: fmt->handler = &process_variantkey_hex; break; case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; + case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break; default: error("TODO: handler for type %d\n", fmt->type); } if ( key && fmt->type==T_INFO ) @@ -1346,6 +1371,8 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) register_tag(convert, T_PBINOM, str.s, is_gtf); q++; } + else if ( !strcmp(str.s,"N_PASS") ) + error("N_PASS() must be placed outside the square brackets\n"); else { fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); @@ -1382,7 +1409,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); - else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. (todo)\n"); + else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. 
(todo)\n"); else if ( !strcmp(str.s, "INFO") ) { if ( *q=='/' ) @@ -1400,6 +1427,22 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) } else if ( !strcmp(str.s, "FORMAT") ) register_tag(convert, T_FORMAT, NULL, 0); + else if ( !strcmp(str.s,"N_PASS") ) + { + if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); + p = ++q; + str.l = 0; + int nopen = 1; + while ( *q && nopen ) + { + if ( *q=='(' ) nopen++; + else if ( *q==')' ) nopen--; + q++; + } + if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str); + kputsn(p, q-p-1, &str); + register_tag(convert, T_NPASS, str.s, is_gtf); + } else { fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); @@ -1567,7 +1610,8 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) for (js=0; jsnsamples; js++) { // Skip samples when filtering was requested - if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; + int ks = convert->samples[js]; + if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue; // Here comes a hack designed for TBCSQ. When running on large files, // such as 1000GP, there are too many empty fields in the output and @@ -1576,7 +1620,6 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) // brackets here. This may be changed in future, time will show... size_t l_start = str->l; - int ks = convert->samples[js]; for (k=i; kfmt[k].type == T_MASK ) diff --git a/bcftools/csq.c b/bcftools/csq.c index c9a0132..8e3ee3b 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -1,9 +1,6 @@ -//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz - - /* The MIT License - Copyright (c) 2016-2018 Genome Research Ltd. + Copyright (c) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -136,6 +133,7 @@ #include #include +#include #include #include #include @@ -592,8 +590,8 @@ typedef struct _args_t char *bcsq_tag; int argc, output_type; int phase, verbosity, local_csq, record_cmd_line; - int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ - int ncsq_small_warned; + int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) + int ncsq2_small_warned; int brief_predictions; int rid; // current chromosome @@ -680,11 +678,42 @@ static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end) int iseq; if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) { - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = strdup(chr_beg); - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + // check for possible mismatch in chromosome naming convention such as chrX vs X + char *new_chr = NULL; + if ( faidx_has_seq(args->fai,chr_beg) ) + new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx + else + { + int len = strlen(chr_beg); + if ( !strncmp("chr",chr_beg,3) && len>3 ) + new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not + else + { + new_chr = malloc(len+3); // gff does not have the prefix, faidx has + memcpy(new_chr,"chr",3); + memcpy(new_chr+3,chr_beg,len); + new_chr[len+3] = 0; + } + if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai + { + static int unkwn_chr_warned = 0; + if ( !unkwn_chr_warned && args->verbosity>0 ) + fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg); + unkwn_chr_warned = 1; + free(new_chr); + new_chr = strdup(chr_beg); // use the original sequence name + } + } + if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 ) + { + 
hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = new_chr; + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + } + else + free(new_chr); } chr_end[1] = c; return iseq; @@ -1140,7 +1169,8 @@ void tscript_init_cds(args_t *args) tscript_ok = 0; break; } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1178,7 +1208,8 @@ void tscript_init_cds(args_t *args) tscript_ok = 0; break; } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1196,8 +1227,17 @@ void tscript_init_cds(args_t *args) gf_cds_t *a = tr->cds[i-1]; gf_cds_t *b = tr->cds[i]; if ( a->beg + a->len - 1 >= b->beg ) - error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n", - kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + { + if ( args->force ) + { + fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n", + args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + } + else + error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. 
ribosomal slippage)?\n" + " Use the --force option to override (at your own risk).\n", + args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + } } if ( len%3 != 0 ) { @@ -1337,9 +1377,22 @@ void init_gff(args_t *args) khash_str2int_destroy_free(aux->ignored_biotypes); } +static inline int ncsq2_to_nfmt(int ncsq2) +{ + return 1 + (ncsq2 - 1) / 30; +} +static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit) +{ + *ival = icsq2 / 30; + *ibit = icsq2 % 30; +} + void init_data(args_t *args) { - args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; + args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max); + + args->fai = fai_load(args->fa_fname); + if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); init_gff(args); @@ -1349,9 +1402,6 @@ void init_data(args_t *args) if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); - args->fai = fai_load(args->fa_fname); - if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); - args->pos2vbuf = kh_init(pos2vbuf); args->active_tr = khp_init(trhp); args->hap = (hap_t*) calloc(1,sizeof(hap_t)); @@ -1395,7 +1445,7 @@ void init_data(args_t *args) } else { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); @@ -1410,6 +1460,11 @@ void init_data(args_t *args) void destroy_data(args_t *args) { + if ( args->ncsq2_small_warned ) + fprintf(stderr, + "Note: Some samples had too many consequences to be represented in %d bytes. 
If you need to record them all,\n" + " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2); + regidx_destroy(args->idx_cds); regidx_destroy(args->idx_utr); regidx_destroy(args->idx_exon); @@ -2683,13 +2738,13 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) { - if ( !args->brief_predictions ) + if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 ) kputs(aa->s, str); else { - int len = aa->l; + int i, len = aa->l; if ( aa->s[len-1]=='*' ) len--; - kputc(aa->s[0], str); + for (i=0; ibrief_predictions; i++) kputc(aa->s[i], str); kputs("..", str); kputw(beg+len, str); } @@ -3083,22 +3138,24 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha { csq_t *csq = node->csq_list + i; vrec_t *vrec = csq->vrec; - int icsq = 2*csq->idx + ihap; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + int icsq2 = 2*csq->idx + ihap; + if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT { - if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) + if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) ) { fprintf(stderr, "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); - if ( !args->ncsq_small_warned ) + if ( !args->ncsq2_small_warned ) fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); - args->ncsq_small_warned = 1; } + if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2; break; } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + int ival, ibit; + icsq2_to_bit(icsq2, &ival,&ibit); + if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; + vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; } } @@ -3727,22 +3784,26 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) { if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue; - int icsq = 2*csq->idx + j; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + int icsq2 = 2*csq->idx + j; + if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT { int ismpl = args->smpl->idx[i]; - if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) + if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) ) { fprintf(stderr, "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", - args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); - if ( !args->ncsq_small_warned ) + args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1); + if ( !args->ncsq2_small_warned ) fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); - args->ncsq_small_warned = 1; + args->ncsq2_small_warned = 1; } + if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2; + break; } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + int ival, ibit; + icsq2_to_bit(icsq2, &ival,&ibit); + if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; + vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit; } } } @@ -4041,39 +4102,39 @@ static const char *usage(void) return "\n" "About: Haplotype-aware consequence caller.\n" - "Usage: bcftools csq [options] in.vcf\n" + "Usage: bcftools csq [OPTIONS] in.vcf\n" "\n" "Required options:\n" - " -f, --fasta-ref reference file in fasta format\n" - " -g, --gff-annot gff3 annotation file\n" + " -f, --fasta-ref FILE reference file in fasta format\n" + " -g, --gff-annot FILE gff3 annotation file\n" "\n" "CSQ options:\n" - " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" - " -c, --custom-tag use this tag instead of the default BCSQ\n" + " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n" + " -c, --custom-tag STRING use this tag instead of the default BCSQ\n" " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq maximum number of consequences to consider per site [16]\n" - " -p, --phase how to handle unphased heterozygous genotypes: [r]\n" + " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n" + " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n" " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" " s: skip unphased hets\n" "Options:\n" - " -e, 
--exclude exclude sites for which the expression is true\n" + " -e, --exclude EXPR exclude sites for which the expression is true\n" " --force run even if some sanity checks fail\n" - " -i, --include select sites for which the expression is true\n" + " -i, --include EXPR select sites for which the expression is true\n" " --no-version do not append version and command line to the header\n" - " -o, --output write output to a file [standard output]\n" - " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " -o, --output FILE write output to a file [standard output]\n" + " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" - " -r, --regions restrict to comma-separated list of regions\n" - " -R, --regions-file restrict to regions listed in a file\n" - " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file samples to include\n" - " -t, --targets similar to -r but streams rather than index-jumps\n" - " -T, --targets-file similar to -R but streams rather than index-jumps\n" - " --threads use multithreading with worker threads [0]\n" - " -v, --verbose verbosity level 0-2 [1]\n" + " -r, --regions REGION restrict to comma-separated list of regions\n" + " -R, --regions-file FILE restrict to regions listed in a file\n" + " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file FILE samples to include\n" + " -t, --targets REGION similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " --threads INT use multithreading with worker threads [0]\n" + " -v, --verbose INT verbosity level 0-2 [1]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4090,7 +4151,7 @@ int main_csq(int argc, char *argv[]) 
args->argc = argc; args->argv = argv; args->output_type = FT_VCF; args->bcsq_tag = "BCSQ"; - args->ncsq_max = 2*16; + args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values args->verbosity = 1; args->record_cmd_line = 1; @@ -4100,7 +4161,8 @@ int main_csq(int argc, char *argv[]) {"threads",required_argument,NULL,2}, {"help",0,0,'h'}, {"ncsq",1,0,'n'}, - {"brief-predictions",0,0,'b'}, + {"brief-predictions",no_argument,0,'b'}, + {"trim-protein-seq",required_argument,0,'B'}, {"custom-tag",1,0,'c'}, {"local-csq",0,0,'l'}, {"gff-annot",1,0,'g'}, @@ -4123,7 +4185,7 @@ int main_csq(int argc, char *argv[]) }; int c, targets_is_file = 0, regions_is_file = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; - while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) { switch (c) { @@ -4133,7 +4195,14 @@ int main_csq(int argc, char *argv[]) if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); break; case 3 : args->record_cmd_line = 0; break; - case 'b': args->brief_predictions = 1; break; + case 'b': + args->brief_predictions = 1; + fprintf(stderr,"Warning: the -b option will be removed in future versions. 
Please use -B 1 instead.\n"); + break; + case 'B': + args->brief_predictions = strtol(optarg,&tmp,10); + if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg); + break; case 'l': args->local_csq = 1; break; case 'c': args->bcsq_tag = optarg; break; case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; @@ -4155,8 +4224,8 @@ int main_csq(int argc, char *argv[]) case 'f': args->fa_fname = optarg; break; case 'g': args->gff_fname = optarg; break; case 'n': - args->ncsq_max = 2 * atoi(optarg); - if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg); + args->ncsq2_max = 2 * atoi(optarg); + if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg); break; case 'o': args->output_fname = optarg; break; case 'O': @@ -4169,8 +4238,12 @@ int main_csq(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); } break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': regions_list = optarg; break; case 'R': regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_list = optarg; break; diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index e0c3001..e7f6a70 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -1,11 +1,8 @@ #include "bcftools.pysam.h" -//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null 
/lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz - - /* The MIT License - Copyright (c) 2016-2018 Genome Research Ltd. + Copyright (c) 2016-2021 Genome Research Ltd. Author: Petr Danecek @@ -138,6 +135,7 @@ #include #include +#include #include #include #include @@ -594,8 +592,8 @@ typedef struct _args_t char *bcsq_tag; int argc, output_type; int phase, verbosity, local_csq, record_cmd_line; - int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ - int ncsq_small_warned; + int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) + int ncsq2_small_warned; int brief_predictions; int rid; // current chromosome @@ -682,11 +680,42 @@ static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end) int iseq; if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 ) { - hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); - aux->seq[aux->nseq] = strdup(chr_beg); - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; - assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + // check for possible mismatch in chromosome naming convention such as chrX vs X + char *new_chr = NULL; + if ( faidx_has_seq(args->fai,chr_beg) ) + new_chr = strdup(chr_beg); // valid chr name, the same in gff and faidx + else + { + int len = strlen(chr_beg); + if ( !strncmp("chr",chr_beg,3) && len>3 ) + new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not + else + { + new_chr = malloc(len+3); // gff does not have the prefix, faidx has + memcpy(new_chr,"chr",3); + memcpy(new_chr+3,chr_beg,len); + new_chr[len+3] = 0; + } + if ( !faidx_has_seq(args->fai,new_chr) ) // modification did not help, this sequence is not in fai + { + static int unkwn_chr_warned = 0; + if ( !unkwn_chr_warned && args->verbosity>0 ) + 
fprintf(bcftools_stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg); + unkwn_chr_warned = 1; + free(new_chr); + new_chr = strdup(chr_beg); // use the original sequence name + } + } + if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 ) + { + hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq); + aux->seq[aux->nseq] = new_chr; + iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); + aux->nseq++; + assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq + } + else + free(new_chr); } chr_end[1] = c; return iseq; @@ -1142,7 +1171,8 @@ void tscript_init_cds(args_t *args) tscript_ok = 0; break; } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1180,7 +1210,8 @@ void tscript_init_cds(args_t *args) tscript_ok = 0; break; } - error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1198,8 +1229,17 @@ void tscript_init_cds(args_t *args) gf_cds_t *a = tr->cds[i-1]; gf_cds_t *b = tr->cds[i]; if ( a->beg + a->len - 1 >= b->beg ) - error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n", - kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + { + if ( args->force ) + { + fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n", + args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + } + else + error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n" + " Use the --force option to override (at your own risk).\n", + args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); + } } if ( len%3 != 0 ) { @@ -1339,9 +1379,22 @@ void init_gff(args_t *args) khash_str2int_destroy_free(aux->ignored_biotypes); } +static inline int ncsq2_to_nfmt(int ncsq2) +{ + return 1 + (ncsq2 - 1) / 30; +} +static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit) +{ + *ival = icsq2 / 30; + *ibit = icsq2 % 30; +} + void init_data(args_t *args) { - args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; + args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max); + + args->fai = fai_load(args->fa_fname); + if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); init_gff(args); @@ -1351,9 +1404,6 @@ void init_data(args_t *args) if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); - args->fai = fai_load(args->fa_fname); - if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname); - args->pos2vbuf = kh_init(pos2vbuf); 
args->active_tr = khp_init(trhp); args->hap = (hap_t*) calloc(1,sizeof(hap_t)); @@ -1397,7 +1447,7 @@ void init_data(args_t *args) } else { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); @@ -1412,6 +1462,11 @@ void init_data(args_t *args) void destroy_data(args_t *args) { + if ( args->ncsq2_small_warned ) + fprintf(bcftools_stderr, + "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n" + " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2); + regidx_destroy(args->idx_cds); regidx_destroy(args->idx_utr); regidx_destroy(args->idx_exon); @@ -2685,13 +2740,13 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) { - if ( !args->brief_predictions ) + if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 ) kputs(aa->s, str); else { - int len = aa->l; + int i, len = aa->l; if ( aa->s[len-1]=='*' ) len--; - kputc(aa->s[0], str); + for (i=0; ibrief_predictions; i++) kputc(aa->s[i], str); kputs("..", str); kputw(beg+len, str); } @@ -3085,22 +3140,24 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha { csq_t *csq = node->csq_list + i; vrec_t *vrec = csq->vrec; - int icsq = 2*csq->idx + ihap; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + int icsq2 = 2*csq->idx + ihap; + if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so 
can't fit it in FMT { - if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) + if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) ) { fprintf(bcftools_stderr, "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); - if ( !args->ncsq_small_warned ) + if ( !args->ncsq2_small_warned ) fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); - args->ncsq_small_warned = 1; } + if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2; break; } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + int ival, ibit; + icsq2_to_bit(icsq2, &ival,&ibit); + if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; + vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; } } @@ -3729,22 +3786,26 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) { if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue; - int icsq = 2*csq->idx + j; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + int icsq2 = 2*csq->idx + j; + if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT { int ismpl = args->smpl->idx[i]; - if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) + if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) ) { fprintf(bcftools_stderr, "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", - args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); - if ( !args->ncsq_small_warned ) + args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) 
vrec->line->pos+1,icsq2+1); + if ( !args->ncsq2_small_warned ) fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); - args->ncsq_small_warned = 1; + args->ncsq2_small_warned = 1; } + if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2; + break; } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + int ival, ibit; + icsq2_to_bit(icsq2, &ival,&ibit); + if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; + vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit; } } } @@ -4043,39 +4104,39 @@ static const char *usage(void) return "\n" "About: Haplotype-aware consequence caller.\n" - "Usage: bcftools csq [options] in.vcf\n" + "Usage: bcftools csq [OPTIONS] in.vcf\n" "\n" "Required options:\n" - " -f, --fasta-ref reference file in fasta format\n" - " -g, --gff-annot gff3 annotation file\n" + " -f, --fasta-ref FILE reference file in fasta format\n" + " -g, --gff-annot FILE gff3 annotation file\n" "\n" "CSQ options:\n" - " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" - " -c, --custom-tag use this tag instead of the default BCSQ\n" + " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n" + " -c, --custom-tag STRING use this tag instead of the default BCSQ\n" " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq maximum number of consequences to consider per site [16]\n" - " -p, --phase how to handle unphased heterozygous genotypes: [r]\n" + " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n" + " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n" " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" " r: require phased GTs, throw an error on 
unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" " s: skip unphased hets\n" "Options:\n" - " -e, --exclude exclude sites for which the expression is true\n" + " -e, --exclude EXPR exclude sites for which the expression is true\n" " --force run even if some sanity checks fail\n" - " -i, --include select sites for which the expression is true\n" + " -i, --include EXPR select sites for which the expression is true\n" " --no-version do not append version and command line to the header\n" - " -o, --output write output to a file [standard output]\n" - " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " -o, --output FILE write output to a file [standard output]\n" + " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" - " -r, --regions restrict to comma-separated list of regions\n" - " -R, --regions-file restrict to regions listed in a file\n" - " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file samples to include\n" - " -t, --targets similar to -r but streams rather than index-jumps\n" - " -T, --targets-file similar to -R but streams rather than index-jumps\n" - " --threads use multithreading with worker threads [0]\n" - " -v, --verbose verbosity level 0-2 [1]\n" + " -r, --regions REGION restrict to comma-separated list of regions\n" + " -R, --regions-file FILE restrict to regions listed in a file\n" + " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file FILE samples to include\n" + " -t, --targets REGION similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " --threads INT use multithreading with worker threads [0]\n" + " -v, --verbose INT verbosity level 0-2 [1]\n" "\n" 
"Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4092,7 +4153,7 @@ int main_csq(int argc, char *argv[]) args->argc = argc; args->argv = argv; args->output_type = FT_VCF; args->bcsq_tag = "BCSQ"; - args->ncsq_max = 2*16; + args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values args->verbosity = 1; args->record_cmd_line = 1; @@ -4102,7 +4163,8 @@ int main_csq(int argc, char *argv[]) {"threads",required_argument,NULL,2}, {"help",0,0,'h'}, {"ncsq",1,0,'n'}, - {"brief-predictions",0,0,'b'}, + {"brief-predictions",no_argument,0,'b'}, + {"trim-protein-seq",required_argument,0,'B'}, {"custom-tag",1,0,'c'}, {"local-csq",0,0,'l'}, {"gff-annot",1,0,'g'}, @@ -4125,7 +4187,7 @@ int main_csq(int argc, char *argv[]) }; int c, targets_is_file = 0, regions_is_file = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; - while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) { switch (c) { @@ -4135,7 +4197,14 @@ int main_csq(int argc, char *argv[]) if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); break; case 3 : args->record_cmd_line = 0; break; - case 'b': args->brief_predictions = 1; break; + case 'b': + args->brief_predictions = 1; + fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. 
Please use -B 1 instead.\n"); + break; + case 'B': + args->brief_predictions = strtol(optarg,&tmp,10); + if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg); + break; case 'l': args->local_csq = 1; break; case 'c': args->bcsq_tag = optarg; break; case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; @@ -4157,8 +4226,8 @@ int main_csq(int argc, char *argv[]) case 'f': args->fa_fname = optarg; break; case 'g': args->gff_fname = optarg; break; case 'n': - args->ncsq_max = 2 * atoi(optarg); - if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg); + args->ncsq2_max = 2 * atoi(optarg); + if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg); break; case 'o': args->output_fname = optarg; break; case 'O': @@ -4171,8 +4240,12 @@ int main_csq(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); } break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': regions_list = optarg; break; case 'R': regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_list = optarg; break; diff --git a/bcftools/dist.c b/bcftools/dist.c new file mode 100644 index 0000000..094fc73 --- /dev/null +++ b/bcftools/dist.c @@ -0,0 +1,124 @@ +/* The MIT License + + Copyright (c) 2016-2020 Genome Research Ltd. 
+ + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +#include +#include +#include +#include +#include +#include "dist.h" + +extern void error(const char *format, ...); + +struct _dist_t +{ + uint64_t *bins, nvalues; + int nbins; + int npow; // the number of orders of magnitude to represent exactly + int nexact; // pow(10,npow) + int nlevel; +}; + +dist_t *dist_init(int npow) +{ + dist_t *dist = (dist_t*) calloc(1,sizeof(dist_t)); + dist->npow = npow; + dist->nexact = pow(10,npow); + dist->nlevel = dist->nexact - pow(10,npow-1); + return dist; +} + +void dist_destroy(dist_t *dist) +{ + if ( !dist ) return; + free(dist->bins); + free(dist); +} + +int dist_nbins(dist_t *dist) +{ + return dist->nbins; +} + +int dist_nvalues(dist_t *dist) +{ + return dist->nvalues; +} + +uint32_t dist_insert(dist_t *dist, uint32_t value) +{ + int ibin; + + if ( value <= dist->nexact ) + ibin = value; + else + { + int npow = (int) log10(value); + int level = npow - dist->npow + 1; + uint32_t step = pow(10, level); + ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step; + } + + if ( ibin >= dist->nbins ) + { + dist->bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1)); + memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins)); + dist->nbins = ibin+1; + } + dist->bins[ibin]++; + dist->nvalues++; + return ibin; +} +uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt) +{ + if ( !cnt ) return 0; + int ibin = dist_insert(dist, value); + dist->bins[ibin] += cnt - 1; + dist->nvalues += cnt; + return ibin; +} + +uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end) +{ + if ( idx < dist->nexact ) + { + if ( beg ) *beg = idx; + if ( end ) *end = idx + 1; + } + else + { + int level = (idx - dist->nexact) / dist->nlevel + 1; + int bin = idx - dist->nexact - dist->nlevel*(level-1); + + uint32_t step = pow(10, level); + uint32_t value = pow(10, level + dist->npow - 1) + step*bin; + + if ( beg ) *beg = value; + if ( end ) *end = value + step; + } 
+ return dist->bins[idx]; +} + diff --git a/bcftools/dist.c.pysam.c b/bcftools/dist.c.pysam.c new file mode 100644 index 0000000..f3f0915 --- /dev/null +++ b/bcftools/dist.c.pysam.c @@ -0,0 +1,126 @@ +#include "bcftools.pysam.h" + +/* The MIT License + + Copyright (c) 2016-2020 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +#include +#include +#include +#include +#include +#include "dist.h" + +extern void error(const char *format, ...); + +struct _dist_t +{ + uint64_t *bins, nvalues; + int nbins; + int npow; // the number of orders of magnitude to represent exactly + int nexact; // pow(10,npow) + int nlevel; +}; + +dist_t *dist_init(int npow) +{ + dist_t *dist = (dist_t*) calloc(1,sizeof(dist_t)); + dist->npow = npow; + dist->nexact = pow(10,npow); + dist->nlevel = dist->nexact - pow(10,npow-1); + return dist; +} + +void dist_destroy(dist_t *dist) +{ + if ( !dist ) return; + free(dist->bins); + free(dist); +} + +int dist_nbins(dist_t *dist) +{ + return dist->nbins; +} + +int dist_nvalues(dist_t *dist) +{ + return dist->nvalues; +} + +uint32_t dist_insert(dist_t *dist, uint32_t value) +{ + int ibin; + + if ( value <= dist->nexact ) + ibin = value; + else + { + int npow = (int) log10(value); + int level = npow - dist->npow + 1; + uint32_t step = pow(10, level); + ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step; + } + + if ( ibin >= dist->nbins ) + { + dist->bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1)); + memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins)); + dist->nbins = ibin+1; + } + dist->bins[ibin]++; + dist->nvalues++; + return ibin; +} +uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt) +{ + if ( !cnt ) return 0; + int ibin = dist_insert(dist, value); + dist->bins[ibin] += cnt - 1; + dist->nvalues += cnt; + return ibin; +} + +uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end) +{ + if ( idx < dist->nexact ) + { + if ( beg ) *beg = idx; + if ( end ) *end = idx + 1; + } + else + { + int level = (idx - dist->nexact) / dist->nlevel + 1; + int bin = idx - dist->nexact - dist->nlevel*(level-1); + + uint32_t step = pow(10, level); + uint32_t value = pow(10, level + dist->npow - 1) + step*bin; + + if ( beg ) *beg = value; + if ( end ) *end = value + step; + } 
+ return dist->bins[idx]; +} + diff --git a/bcftools/dist.h b/bcftools/dist.h new file mode 100644 index 0000000..5c9c571 --- /dev/null +++ b/bcftools/dist.h @@ -0,0 +1,98 @@ +/* The MIT License + + Copyright (c) 2016-2020 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ +/* + Logarithmic binning + + Example of usage: + + // Initialize, make the binning exact up to 10^4, then add a log-step + dist_t *dist = dist_init(4); + + // Insert values + int i; + for (i=0; i<1e6; i++) + dist_insert(dist, i); + + // Number of bins used + int n = dist_n(dist); + + // Now print the distribution + uint32_t beg, end; + for (i=0; i +#include + +typedef struct _dist_t dist_t; + +/* + * dist_init() - init bins + */ +dist_t *dist_init(int npow); +void dist_destroy(dist_t *dist); + +/* + dist_nbins() - get the number of bins + */ +int dist_nbins(dist_t *dist); + +/* + dist_nvalues() - get the total number of values inserted + */ +int dist_nvalues(dist_t *dist); + +/* + dist_insert() - insert new value + dist_insert_n() - insert new value n times + */ +uint32_t dist_insert(dist_t *dist, uint32_t value); +uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt); + +/* + dist_get() + @idx: from the interval [0,dist_n-1] + @beg,end: [beg,end) + */ +uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end); + +#endif + diff --git a/bcftools/em.c b/bcftools/em.c index a976f22..baa3490 100644 --- a/bcftools/em.c +++ b/bcftools/em.c @@ -1,7 +1,7 @@ /* em.c -- mathematical functions. Copyright (C) 2010, 2011 Broad Institute. - Portions copyright (C) 2013 Genome Research Ltd. + Portions copyright (C) 2013-2014 Genome Research Ltd. Author: Heng Li diff --git a/bcftools/em.c.pysam.c b/bcftools/em.c.pysam.c index db27d06..37a3dea 100644 --- a/bcftools/em.c.pysam.c +++ b/bcftools/em.c.pysam.c @@ -3,7 +3,7 @@ /* em.c -- mathematical functions. Copyright (C) 2010, 2011 Broad Institute. - Portions copyright (C) 2013 Genome Research Ltd. + Portions copyright (C) 2013-2014 Genome Research Ltd. 
Author: Heng Li diff --git a/bcftools/extsort.c b/bcftools/extsort.c new file mode 100644 index 0000000..014e03b --- /dev/null +++ b/bcftools/extsort.c @@ -0,0 +1,250 @@ +/* ext-sort.h -- sort on disk + + Copyright (C) 2020-2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +#include +#include // for unlink() +#include // for chmod() +#include +#include +#ifdef _WIN32 +#include +#endif +#include "bcftools.h" +#include "extsort.h" +#include "kheap.h" + +typedef struct +{ + extsort_t *es; // this is to get access to extsort_cmp_f from kheap + int fd; + char *fname; + void *dat; +} +blk_t; + +static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr); +KHEAP_INIT(blk, blk_t*, blk_is_smaller) /* defines khp_blk_t */ + +struct _extsort_t +{ + size_t dat_size, mem, max_mem; + char *tmp_prefix; + extsort_cmp_f cmp; + + size_t nbuf, mbuf, nblk; + blk_t **blk; + void **buf, *tmp_dat; + khp_blk_t *bhp; +}; + +static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) +{ + blk_t *a = *aptr; + blk_t *b = *bptr; + int ret = a->es->cmp(&a->dat,&b->dat); + if ( ret < 0 ) return 1; + return 0; +} + +size_t parse_mem_string(const char *str); + +void extsort_set(extsort_t *es, extsort_opt_t key, void *value) +{ + if ( key==DAT_SIZE ) { es->dat_size = *((size_t*)value); return; } + if ( key==MAX_MEM ) + { + es->max_mem = parse_mem_string(*((const char**)value)); + if ( es->max_mem <=0 ) error("Could not parse the memory string, expected positive number: %s\n",*((const char**)value)); + return; + } + if ( key==TMP_PREFIX ) { es->tmp_prefix = init_tmp_prefix(*((const char**)value)); return; } + if ( key==FUNC_CMP ) { es->cmp = *((extsort_cmp_f*)value); return; } +} + +extsort_t *extsort_alloc(void) +{ + extsort_t *es = (extsort_t*) calloc(1,sizeof(*es)); + es->max_mem = 100e6; + return es; +} +void extsort_init(extsort_t *es) +{ + assert( es->cmp ); + assert( es->dat_size ); + if ( !es->tmp_prefix ) es->tmp_prefix = init_tmp_prefix(NULL); + es->tmp_dat = malloc(es->dat_size); +} + +void extsort_destroy(extsort_t *es) +{ + int i; + for (i=0; inblk; i++) + { + blk_t *blk = es->blk[i]; + if ( blk->fd!=-1 ) +#ifdef _WIN32 + _close(blk->fd); +#else + close(blk->fd); +#endif + free(blk->fname); + free(blk->dat); + free(blk); + } + 
free(es->tmp_dat); + free(es->tmp_prefix); + free(es->blk); + khp_destroy(blk, es->bhp); + free(es); +} + +static void _buf_flush(extsort_t *es) +{ + int i; + if ( !es->nbuf ) return; + + qsort(es->buf, es->nbuf, sizeof(void*), es->cmp); + + es->nblk++; + es->blk = (blk_t**) realloc(es->blk, sizeof(blk_t*)*es->nblk); + es->blk[es->nblk-1] = (blk_t*) calloc(1,sizeof(blk_t)); + blk_t *blk = es->blk[es->nblk-1]; + blk->es = es; + blk->dat = malloc(es->dat_size); + blk->fname = strdup(es->tmp_prefix); + #ifdef _WIN32 + for (i=0; i<100000; i++) + { + memcpy(blk->fname,es->tmp_prefix,strlen(es->tmp_prefix)); + mktemp(blk->fname); + blk->fd = _open(blk->fname, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600); + if ( blk->fd==-1 ) + { + if ( errno==EEXIST ) continue; + error("Error: failed to open a temporary file %s\n",blk->fname); + } + break; + } + if ( !blk->fd ) error("Error: failed to create a unique temporary file name from %s\n",es->tmp_prefix); + if ( _chmod(blk->fname, S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname); + #else + if ( (blk->fd = mkstemp(blk->fname))==-1 ) + error("Error: failed to open a temporary file %s\n",blk->fname); + if ( fchmod(blk->fd,S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname); + unlink(blk->fname); // should auto delete when closed on linux, the descriptor remains open + #endif + + for (i=0; inbuf; i++) + { + #ifdef _WIN32 + if ( _write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname); + #else + if ( write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname); + #endif + free(es->buf[i]); + } +#ifdef _WIN32 + if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#else + if ( 
lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#endif + + es->nbuf = 0; + es->mem = 0; +} + +void extsort_push(extsort_t *es, void *dat) +{ + int delta = sizeof(void*) + es->dat_size; + if ( es->nbuf && es->mem + delta > es->max_mem ) _buf_flush(es); + es->nbuf++; + es->mem += delta; + hts_expand(void*, es->nbuf, es->mbuf, es->buf); + es->buf[es->nbuf-1] = dat; +} + +// return number of elements read +static ssize_t _blk_read(extsort_t *es, blk_t *blk) +{ + ssize_t ret = 0; + if ( blk->fd==-1 ) return ret; +#ifdef _WIN32 + ret = _read(blk->fd, blk->dat, es->dat_size); +#else + ret = read(blk->fd, blk->dat, es->dat_size); +#endif + if ( ret < 0 ) error("Error: failed to read from the temporary file %s\n", blk->fname); + if ( ret == 0 ) + { +#ifdef _WIN32 + if ( _close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname); +#else + if ( close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname); +#endif + blk->fd = -1; + return ret; + } + if ( ret < es->dat_size ) error("Error: failed to read %zu bytes from the temporary file %s\n",es->dat_size,blk->fname); + return ret; +} + +void extsort_sort(extsort_t *es) +{ + _buf_flush(es); + free(es->buf); + es->buf = NULL; + es->bhp = khp_init(blk); + + // open all blocks, read one record from each, create a heap + int i; + for (i=0; inblk; i++) + { + blk_t *blk = es->blk[i]; +#ifdef _WIN32 + if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#else + if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#endif + int ret = _blk_read(es, blk); + if ( ret ) khp_insert(blk, es->bhp, &blk); + } +} + +void *extsort_shift(extsort_t *es) +{ + if ( !es->bhp->ndat ) return NULL; + blk_t *blk = es->bhp->dat[0]; + + // swap the pointer which keeps the location of user 
data so that it is not overwritten by the next read + void *tmp = es->tmp_dat; es->tmp_dat = blk->dat; blk->dat = tmp; + khp_delete(blk, es->bhp); + + int ret = _blk_read(es, blk); + if ( ret ) khp_insert(blk, es->bhp, &blk); + + return es->tmp_dat; +} + diff --git a/bcftools/extsort.c.pysam.c b/bcftools/extsort.c.pysam.c new file mode 100644 index 0000000..1b410a7 --- /dev/null +++ b/bcftools/extsort.c.pysam.c @@ -0,0 +1,252 @@ +#include "bcftools.pysam.h" + +/* ext-sort.h -- sort on disk + + Copyright (C) 2020-2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +#include +#include // for unlink() +#include // for chmod() +#include +#include +#ifdef _WIN32 +#include +#endif +#include "bcftools.h" +#include "extsort.h" +#include "kheap.h" + +typedef struct +{ + extsort_t *es; // this is to get access to extsort_cmp_f from kheap + int fd; + char *fname; + void *dat; +} +blk_t; + +static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr); +KHEAP_INIT(blk, blk_t*, blk_is_smaller) /* defines khp_blk_t */ + +struct _extsort_t +{ + size_t dat_size, mem, max_mem; + char *tmp_prefix; + extsort_cmp_f cmp; + + size_t nbuf, mbuf, nblk; + blk_t **blk; + void **buf, *tmp_dat; + khp_blk_t *bhp; +}; + +static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) +{ + blk_t *a = *aptr; + blk_t *b = *bptr; + int ret = a->es->cmp(&a->dat,&b->dat); + if ( ret < 0 ) return 1; + return 0; +} + +size_t parse_mem_string(const char *str); + +void extsort_set(extsort_t *es, extsort_opt_t key, void *value) +{ + if ( key==DAT_SIZE ) { es->dat_size = *((size_t*)value); return; } + if ( key==MAX_MEM ) + { + es->max_mem = parse_mem_string(*((const char**)value)); + if ( es->max_mem <=0 ) error("Could not parse the memory string, expected positive number: %s\n",*((const char**)value)); + return; + } + if ( key==TMP_PREFIX ) { es->tmp_prefix = init_tmp_prefix(*((const char**)value)); return; } + if ( key==FUNC_CMP ) { es->cmp = *((extsort_cmp_f*)value); return; } +} + +extsort_t *extsort_alloc(void) +{ + extsort_t *es = (extsort_t*) calloc(1,sizeof(*es)); + es->max_mem = 100e6; + return es; +} +void extsort_init(extsort_t *es) +{ + assert( es->cmp ); + assert( es->dat_size ); + if ( !es->tmp_prefix ) es->tmp_prefix = init_tmp_prefix(NULL); + es->tmp_dat = malloc(es->dat_size); +} + +void extsort_destroy(extsort_t *es) +{ + int i; + for (i=0; inblk; i++) + { + blk_t *blk = es->blk[i]; + if ( blk->fd!=-1 ) +#ifdef _WIN32 + _close(blk->fd); +#else + close(blk->fd); +#endif + free(blk->fname); + free(blk->dat); + free(blk); + } + 
free(es->tmp_dat); + free(es->tmp_prefix); + free(es->blk); + khp_destroy(blk, es->bhp); + free(es); +} + +static void _buf_flush(extsort_t *es) +{ + int i; + if ( !es->nbuf ) return; + + qsort(es->buf, es->nbuf, sizeof(void*), es->cmp); + + es->nblk++; + es->blk = (blk_t**) realloc(es->blk, sizeof(blk_t*)*es->nblk); + es->blk[es->nblk-1] = (blk_t*) calloc(1,sizeof(blk_t)); + blk_t *blk = es->blk[es->nblk-1]; + blk->es = es; + blk->dat = malloc(es->dat_size); + blk->fname = strdup(es->tmp_prefix); + #ifdef _WIN32 + for (i=0; i<100000; i++) + { + memcpy(blk->fname,es->tmp_prefix,strlen(es->tmp_prefix)); + mktemp(blk->fname); + blk->fd = _open(blk->fname, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600); + if ( blk->fd==-1 ) + { + if ( errno==EEXIST ) continue; + error("Error: failed to open a temporary file %s\n",blk->fname); + } + break; + } + if ( !blk->fd ) error("Error: failed to create a unique temporary file name from %s\n",es->tmp_prefix); + if ( _chmod(blk->fname, S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname); + #else + if ( (blk->fd = mkstemp(blk->fname))==-1 ) + error("Error: failed to open a temporary file %s\n",blk->fname); + if ( fchmod(blk->fd,S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname); + unlink(blk->fname); // should auto delete when closed on linux, the descriptor remains open + #endif + + for (i=0; inbuf; i++) + { + #ifdef _WIN32 + if ( _write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname); + #else + if ( write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname); + #endif + free(es->buf[i]); + } +#ifdef _WIN32 + if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#else + if ( 
lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#endif + + es->nbuf = 0; + es->mem = 0; +} + +void extsort_push(extsort_t *es, void *dat) +{ + int delta = sizeof(void*) + es->dat_size; + if ( es->nbuf && es->mem + delta > es->max_mem ) _buf_flush(es); + es->nbuf++; + es->mem += delta; + hts_expand(void*, es->nbuf, es->mbuf, es->buf); + es->buf[es->nbuf-1] = dat; +} + +// return number of elements read +static ssize_t _blk_read(extsort_t *es, blk_t *blk) +{ + ssize_t ret = 0; + if ( blk->fd==-1 ) return ret; +#ifdef _WIN32 + ret = _read(blk->fd, blk->dat, es->dat_size); +#else + ret = read(blk->fd, blk->dat, es->dat_size); +#endif + if ( ret < 0 ) error("Error: failed to read from the temporary file %s\n", blk->fname); + if ( ret == 0 ) + { +#ifdef _WIN32 + if ( _close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname); +#else + if ( close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname); +#endif + blk->fd = -1; + return ret; + } + if ( ret < es->dat_size ) error("Error: failed to read %zu bytes from the temporary file %s\n",es->dat_size,blk->fname); + return ret; +} + +void extsort_sort(extsort_t *es) +{ + _buf_flush(es); + free(es->buf); + es->buf = NULL; + es->bhp = khp_init(blk); + + // open all blocks, read one record from each, create a heap + int i; + for (i=0; inblk; i++) + { + blk_t *blk = es->blk[i]; +#ifdef _WIN32 + if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#else + if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname); +#endif + int ret = _blk_read(es, blk); + if ( ret ) khp_insert(blk, es->bhp, &blk); + } +} + +void *extsort_shift(extsort_t *es) +{ + if ( !es->bhp->ndat ) return NULL; + blk_t *blk = es->bhp->dat[0]; + + // swap the pointer which keeps the location of user 
data so that it is not overwritten by the next read + void *tmp = es->tmp_dat; es->tmp_dat = blk->dat; blk->dat = tmp; + khp_delete(blk, es->bhp); + + int ret = _blk_read(es, blk); + if ( ret ) khp_insert(blk, es->bhp, &blk); + + return es->tmp_dat; +} + diff --git a/bcftools/extsort.h b/bcftools/extsort.h new file mode 100644 index 0000000..ba6282e --- /dev/null +++ b/bcftools/extsort.h @@ -0,0 +1,56 @@ +/* ext-sort.h -- sort on disk + + Copyright (C) 2020 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#ifndef __EXTSORT_H__ +#define __EXTSORT_H__ + +//todo: return status to all functions + +typedef struct _extsort_t extsort_t; + +typedef int (*extsort_cmp_f) (const void *aptr, const void *bptr); + +// Modes of operation +typedef enum +{ + DAT_SIZE, // size_t .. assuming constant size records for now + TMP_PREFIX, // const char* .. prefix of temporary files, XXXXXX will be appended + MAX_MEM, // const char* .. 
maximum memory to use, e.g. 100MB + FUNC_CMP, // extsort_cmp_f .. sort function +} +extsort_opt_t; + +#define extsort_set_opt(es,type,key,value) { type tmp = value; extsort_set(es, key, (void*)&tmp); } + +extsort_t *extsort_alloc(void); +void extsort_set(extsort_t *es, extsort_opt_t key, void *value); +void extsort_init(extsort_t *es); +void extsort_push(extsort_t *es, void *dat); // dat will be freed by extsort later +void extsort_sort(extsort_t *es); +void *extsort_shift(extsort_t *es); +void extsort_destroy(extsort_t *es); + +#endif diff --git a/bcftools/filter.c b/bcftools/filter.c index ea60036..3c45195 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -1,6 +1,6 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -56,27 +57,6 @@ static int filter_ninit = 0; # define __FUNCTION__ __func__ #endif -static const uint64_t bcf_double_missing = 0x7ff0000000000001; -static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; -static inline void bcf_double_set(double *ptr, uint64_t value) -{ - union { uint64_t i; double d; } u; - u.i = value; - *ptr = u.d; -} -static inline int bcf_double_test(double d, uint64_t value) -{ - union { uint64_t i; double d; } u; - u.d = d; - return u.i==value ? 
1 : 0; -} -#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) -#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) -#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) -#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) -#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) - - typedef struct _token_t { // read-only values, same for all VCF lines @@ -89,9 +69,9 @@ typedef struct _token_t int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) - int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited + int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only int nidxs, nuidxs; // size of idxs array and the number of elements set to 1 - uint8_t *usmpl; // bitmask of used samples as set by idx + uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise int nsamples; // number of samples for format fields, 0 for info and other fields void (*setter)(filter_t *, bcf1_t *, struct _token_t *); int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack); @@ -158,11 +138,19 @@ struct _filter_t #define TOK_PHRED 29 #define TOK_MEDIAN 30 #define TOK_STDEV 31 - -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; -#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" +#define TOK_sMAX 32 +#define TOK_sMIN 33 +#define TOK_sAVG 34 +#define TOK_sMEDIAN 35 +#define TOK_sSTDEV 36 +#define TOK_sSUM 37 +#define TOK_IN 38 // contains, e.g. FILTER~"A" +#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" + +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; +#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently // Return negative values if it is a function with variable number of arguments static int filters_next_token(char **str, int *len) @@ -184,6 +172,20 @@ static int filters_next_token(char **str, int *len) tmp = *str; } + if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; } + if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; } + if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; } + if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; } + if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; } + if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; } + if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; } + if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; } + if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; } + if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; } + if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; } + if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; } + if ( 
!strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; } + if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; } if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; } @@ -417,7 +419,7 @@ static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { int i; - if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match + if ( rtok->tok_type==TOK_NOT_IN ) { if ( !line->d.n_flt ) { @@ -430,7 +432,7 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 rtok->pass_site = 1; return; } - else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match + else if ( rtok->tok_type==TOK_IN ) { if ( !line->d.n_flt ) { @@ -441,8 +443,30 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; } return; } + else if ( rtok->tok_type==TOK_NE ) // exact match + { + if ( !line->d.n_flt ) + { + if ( atok->hdr_id==-1 ) return; // missing value + rtok->pass_site = 1; + return; // no filter present, eval to true + } + if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present + rtok->pass_site = 1; + return; + } + else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present + { + if ( !line->d.n_flt ) + { + if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } + return; // no filter present, eval to false + } + if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1; + return; + } else - error("Only == and != operators are supported for FILTER\n"); + error("Only ==, !=, ~, and !~ operators are supported for FILTER\n"); return; } 
static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) @@ -1036,54 +1060,46 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; return; } - if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n"); - + int j,nmissing = 0; - for (i=0; in_sample; i++) - { - int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size); - for (j=0; jn; j++) - { - if ( ptr[j]==bcf_int8_vector_end ) break; - if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } - } + #define BRANCH(type_t, is_vector_end) { \ + for (i=0; in_sample; i++) \ + { \ + type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \ + for (j=0; jn; j++) \ + { \ + if ( ptr[j]==is_vector_end ) break; \ + if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \ + } \ + } \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; + default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break; } + #undef BRANCH tok->nvalues = 1; tok->values[0] = tok->tag[0]=='N' ? 
nmissing : (double)nmissing / line->n_sample; } static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - if ( nstack==0 ) error("Error parsing the expresion\n"); + if ( nstack==0 ) error("Error parsing the expression\n"); token_t *tok = stack[nstack - 1]; if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); - - rtok->nsamples = tok->nsamples; - memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); - assert(tok->usmpl); - if ( !rtok->usmpl ) - { - rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); - memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); - } int i, npass = 0; - for (i=0; insamples; i++) + for (i=0; insamples; i++) { - if ( !rtok->usmpl[i] ) continue; - if ( rtok->pass_samples[i] ) npass++; + if ( !tok->usmpl[i] ) continue; + if ( tok->pass_samples[i] ) npass++; } - - hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); - double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); - rtok->nval1 = 1; - rtok->nvalues = rtok->nsamples; - - // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats - // consider only the passing site AND samples. The values for failed samples is set to -1 so - // that it can never conflict with valid expressions. - for (i=0; insamples; i++) - rtok->values[i] = rtok->pass_samples[i] ? value : -1; + hts_expand(double,1,rtok->mvalues,rtok->values); + rtok->nsamples = 0; + rtok->nvalues = 1; + rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 
1.0*npass/line->n_sample : 0); return 1; } @@ -1165,13 +1181,30 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - double val = -HUGE_VAL; - int i, has_value = 0; - for (i=0; invalues; i++) + double *ptr, val = -HUGE_VAL; + int i,j, has_value = 0; + if ( tok->nsamples ) { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val < tok->values[i] ) val = tok->values[i]; + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val < ptr[j] ) val = ptr[j]; + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val < tok->values[i] ) val = tok->values[i]; + } } if ( has_value ) { @@ -1180,18 +1213,65 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_max(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, has_value; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = -HUGE_VAL; + has_value = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val < ptr[j] ) val = ptr[j]; + } + if ( has_value ) rtok->values[i] = val; + else 
bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - double val = HUGE_VAL; - int i, has_value = 0; - for (i=0; invalues; i++) + double *ptr, val = HUGE_VAL; + int i,j, has_value = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val > ptr[j] ) val = ptr[j]; + } + } + } + else { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val > tok->values[i] ) val = tok->values[i]; + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val > tok->values[i] ) val = tok->values[i]; + } } if ( has_value ) { @@ -1200,15 +1280,62 @@ static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_min(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, has_value; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = HUGE_VAL; + has_value = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val > ptr[j] ) val = ptr[j]; + } + if ( has_value ) rtok->values[i] = val; + else 
bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) - if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + double *ptr, val = 0; + int i,j, n = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + val += ptr[j]; + n++; + } + } + } + else + { + for (i=0; invalues; i++) + if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + } if ( n ) { rtok->values[0] = val / n; @@ -1216,6 +1343,34 @@ static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, n; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = 0; + n = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) { val += ptr[j]; n++; } + } + if ( n ) rtok->values[i] = val / n; + else bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int compare_doubles(const void *lhs, const void *rhs) { double arg1 = *(const double*) lhs; @@ -1229,12 +1384,29 @@ static int 
func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - int i, n = 0; - for (i=0; invalues; i++) + // sweep through all tok->values and while excluding all missing values reuse the very same array + int i,j,k = 0, n = 0; + if ( tok->nsamples ) { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - if ( n < i ) tok->values[n] = tok->values[i]; - n++; + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k += tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue; + if ( n < k ) tok->values[n] = tok->values[k]; + n++; + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + if ( n < i ) tok->values[n] = tok->values[i]; + n++; + } } if ( !n ) return 1; if ( n==1 ) rtok->values[0] = tok->values[0]; @@ -1246,40 +1418,149 @@ static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta rtok->nvalues = 1; return 1; } +static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, n; + double *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + n = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + if ( n < j ) ptr[n] = ptr[j]; + n++; + } + if ( n==0 ) + bcf_double_set_missing(rtok->values[i]); + else if ( n==1 ) + rtok->values[i] = ptr[0]; + else + { 
+ qsort(ptr, n, sizeof(double), compare_doubles); + rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5; + } + } + return 1; +} static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - int i, n = 0; - for (i=0; invalues; i++) + // sweep through all tok->values and while excluding all missing values reuse the very same array + int i,j,k = 0, n = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k += tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue; + if ( n < k ) tok->values[n] = tok->values[k]; + n++; + } + } + } + else { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - if ( n < i ) tok->values[n] = tok->values[i]; - n++; + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + if ( n < i ) tok->values[n] = tok->values[i]; + n++; + } } if ( !n ) return 1; if ( n==1 ) rtok->values[0] = 0; else { double sdev = 0, avg = 0; - for (i=0; ivalues[n]; + for (i=0; ivalues[i]; avg /= n; - for (i=0; ivalues[n] - avg) * (tok->values[n] - avg); + for (i=0; ivalues[i] - avg) * (tok->values[i] - avg); rtok->values[0] = sqrt(sdev/n); } rtok->nvalues = 1; return 1; } +static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, n; + double *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + n = 
0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + if ( n < j ) ptr[n] = ptr[j]; + n++; + } + if ( n==0 ) + bcf_double_set_missing(rtok->values[i]); + else if ( n==1 ) + rtok->values[i] = 0; + else + { + double sdev = 0, avg = 0; + for (j=0; jvalues[i] = sqrt(sdev/n); + } + } + return 1; +} static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { rtok->nvalues = 0; token_t *tok = stack[nstack - 1]; if ( !tok->nvalues ) return 1; - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) - if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + double *ptr, val = 0; + int i,j, n = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + val += ptr[j]; + n++; + } + } + } + else + { + for (i=0; invalues; i++) + if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + } if ( n ) { rtok->values[0] = val; @@ -1287,39 +1568,104 @@ static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, has_value; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = 0; + has_value = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + 
if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + val += ptr[j]; + } + if ( has_value ) rtok->values[i] = val; + else bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; if ( tok->is_str ) error("ABS() can be applied only on numeric values\n"); - + rtok->nsamples = tok->nsamples; rtok->nvalues = tok->nvalues; + rtok->nval1 = tok->nval1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + if ( tok->usmpl ) + { + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + } if ( !tok->nvalues ) return 1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); - int i; - for (i=0; invalues; i++) - if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); - else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); + int i,j,k = 0; + if ( tok->usmpl ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]); + else rtok->values[k] = fabs(tok->values[k]); + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( tok->usmpl && !tok->usmpl[i] ) continue; + if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); + else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); + } + } return 1; } static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; - int i, cnt = 0; - if ( !tok->nsamples ) + int i,j, cnt = 0; + if ( tok->tag && tok->nsamples ) { - if ( tok->is_str ) + // raw number of values in a FMT tag, e.g. 
COUNT(FMT/TAG) + if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n"); + for (i=0; insamples; i++) { - if ( tok->str_value.l ) cnt = 1; - for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; + if ( !tok->usmpl[i] ) continue; + double *ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++; } - else - cnt = tok->nvalues; } - else + else if ( tok->nsamples ) { + // number of samples that pass a processed FMT tag for (i=0; insamples; i++) if ( tok->pass_samples[i] ) cnt++; } + else if ( tok->is_str ) + { + if ( tok->str_value.l ) cnt = 1; + for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; + } + else + cnt = tok->nvalues; rtok->nvalues = 1; rtok->values[0] = cnt; @@ -1531,11 +1877,27 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac if ( !tok->nvalues ) return 1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); - int i; - for (i=0; invalues; i++) - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); - else rtok->values[i] = -4.34294481903*log(tok->values[i]); - + int i,j,k = 0; + if ( tok->usmpl ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]); + else rtok->values[k] = -4.34294481903*log(tok->values[k]); + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); + else rtok->values[i] = -4.34294481903*log(tok->values[i]); + } + } return 1; } inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) @@ -1555,7 +1917,8 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) for (i=0; insamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; for 
(i=0; insamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; } - memset(rtok->pass_samples, 0, rtok->nsamples); + if (rtok->nsamples) + memset(rtok->pass_samples, 0, rtok->nsamples); } #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ @@ -1580,22 +1943,37 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) rtok->values[i] = atok->values[i] AOP btok->values[i]; \ } \ } \ + else if ( atok->nsamples ) \ + { \ + assert( btok->nvalues==1 ); \ + if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \ + { \ + for (i=0; invalues; i++) \ + { \ + if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \ + { \ + bcf_double_set_missing(rtok->values[i]); \ + continue; \ + } \ + has_values = 1; \ + rtok->values[i] = atok->values[i] AOP btok->values[0]; \ + } \ + } \ + } \ else \ { \ - token_t *xtok = atok->nsamples ? atok : btok; \ - token_t *ytok = atok->nsamples ? btok : atok; \ - assert( ytok->nvalues==1 ); \ - if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \ + assert( atok->nvalues==1 ); \ + if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \ { \ - for (i=0; invalues; i++) \ + for (i=0; invalues; i++) \ { \ - if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \ + if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ { \ bcf_double_set_missing(rtok->values[i]); \ continue; \ } \ has_values = 1; \ - rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \ + rtok->values[i] = atok->values[0] AOP btok->values[i]; \ } \ } \ } \ @@ -1711,14 +2089,6 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token return 2; } -#define CMP_MISSING(atok,btok,CMP_OP,ret) \ -{ \ - if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \ - token_t *tok = (atok)->is_missing ? (btok) : (atok); \ - (ret) = ( tok->nvalues CMP_OP 1 ) ? 
0 : 1; \ - tok->nvalues = 1; \ -} - #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \ { \ token_t *rtok = _rtok; \ @@ -1821,31 +2191,56 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token } \ } \ } \ - else \ + else if ( atok->nsamples )\ + { \ + for (i=0; insamples; i++) \ + { \ + if ( !rtok->usmpl[i] ) continue; \ + double *aptr = atok->values + i*atok->nval1; \ + double *bptr = btok->values + i*btok->nval1; \ + for (j=0; jnval1; j++) \ + { \ + int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \ + if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ + for (k=0; knvalues; k++) \ + { \ + int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \ + if ( nmiss ) \ + { \ + if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \ + } \ + else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \ + { \ + if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \ + } \ + else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \ + } \ + } \ + } \ + } \ + else /* btok->nsamples */ \ { \ - token_t *xtok = atok->nsamples ? atok : btok; \ - token_t *ytok = atok->nsamples ? btok : atok; \ - for (i=0; insamples; i++) \ + for (i=0; insamples; i++) \ { \ if ( !rtok->usmpl[i] ) continue; \ - double *xptr = xtok->values + i*xtok->nval1; \ - double *yptr = ytok->values + i*ytok->nval1; \ - for (j=0; jnval1; j++) \ + double *aptr = atok->values + i*atok->nval1; \ + double *bptr = btok->values + i*btok->nval1; \ + for (j=0; jnval1; j++) \ { \ - int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \ + int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 
1 : 0; \ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ - for (k=0; knvalues; k++) \ + for (k=0; knvalues; k++) \ { \ - int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 1 : 0); \ + int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \ if ( nmiss ) \ { \ - if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \ } \ - else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \ + else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \ { \ - if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \ } \ - else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \ } \ } \ } \ @@ -2344,7 +2739,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0; is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0; - if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s); + if ( is_info && is_fmt ) + error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s); } if ( is_fmt==-1 ) is_fmt = 0; } @@ -2833,6 +3229,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) // Additionally, treat "." as missing value rather than a string in numeric equalities; that // @file is only used with ID; etc. 
// This code is fragile: improve me. + static int comma_separator_warned = 0; int i; for (i=0; istr); } + if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') ) + { + int print_note = 0; + if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1; + else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1; + if ( print_note && !comma_separator_warned ) + { + comma_separator_warned = 1; + fprintf(stderr, + "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n" + " (Search the manual for \"Comma in strings\" to learn more.)\n"); + } + } if ( out[i].tok_type!=TOK_VAL ) continue; if ( !out[i].tag ) continue; if ( out[i].setter==filters_set_type ) @@ -2939,11 +3349,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int itok = i, ival; if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1; - else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1; - else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1; + else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1; + else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1; else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i; - else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i; - else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i; + else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i; + else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i; else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( out[ival].tok_type!=TOK_VAL || !out[ival].key ) error("[%s:%d %s] 
Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); @@ -2976,6 +3386,12 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; } hts_expand0(double,1,out[i].mvalues,out[i].values); if ( filter->nsamples ) { @@ -3151,3 +3567,32 @@ int filter_max_unpack(filter_t *flt) { return flt->max_unpack; } + +const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1) +{ + token_t *tok = filter->flt_stack[0]; + if ( tok->nvalues ) + { + *nval = tok->nvalues; + *nval1 = tok->nval1; + } + else + { + if ( !tok->values ) error("fixme in filter_get_doubles(): %s\n", filter->str); + *nval = 1; + *nval1 = 1; + tok->values[0] = filter->flt_stack[0]->pass_site; + } + return tok->values; +} + +void filter_set_samples(filter_t *filter, const uint8_t *samples) +{ + int i,j; + for (i=0; infilters; i++) + { + if ( !filter->filters[i].nsamples ) continue; + for (j=0; jfilters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j]; + } +} + diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 
2d1987a..8832633 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -2,7 +2,7 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -27,6 +27,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -58,27 +59,6 @@ static int filter_ninit = 0; # define __FUNCTION__ __func__ #endif -static const uint64_t bcf_double_missing = 0x7ff0000000000001; -static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; -static inline void bcf_double_set(double *ptr, uint64_t value) -{ - union { uint64_t i; double d; } u; - u.i = value; - *ptr = u.d; -} -static inline int bcf_double_test(double d, uint64_t value) -{ - union { uint64_t i; double d; } u; - u.d = d; - return u.i==value ? 1 : 0; -} -#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) -#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) -#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) -#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) -#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) - - typedef struct _token_t { // read-only values, same for all VCF lines @@ -91,9 +71,9 @@ typedef struct _token_t int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] 
or any field[*], which is equivalent to [0..]) - int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited + int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only int nidxs, nuidxs; // size of idxs array and the number of elements set to 1 - uint8_t *usmpl; // bitmask of used samples as set by idx + uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise int nsamples; // number of samples for format fields, 0 for info and other fields void (*setter)(filter_t *, bcf1_t *, struct _token_t *); int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack); @@ -160,11 +140,19 @@ struct _filter_t #define TOK_PHRED 29 #define TOK_MEDIAN 30 #define TOK_STDEV 31 - -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; -#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" +#define TOK_sMAX 32 +#define TOK_sMIN 33 +#define TOK_sAVG 34 +#define TOK_sMEDIAN 35 +#define TOK_sSTDEV 36 +#define TOK_sSUM 37 +#define TOK_IN 38 // contains, e.g. FILTER~"A" +#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" + +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; +#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently // Return negative values if it is a function with variable number of arguments static int filters_next_token(char **str, int *len) @@ -186,6 +174,20 @@ static int filters_next_token(char **str, int *len) tmp = *str; } + if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; } + if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; } + if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; } + if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; } + if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; } + if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; } + if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; } + if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; } + if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; } + if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; } + if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; } + if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; } + if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; } + if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; } if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; } @@ -419,7 +421,7 @@ static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { int i; - if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can 
match + if ( rtok->tok_type==TOK_NOT_IN ) { if ( !line->d.n_flt ) { @@ -432,7 +434,7 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 rtok->pass_site = 1; return; } - else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match + else if ( rtok->tok_type==TOK_IN ) { if ( !line->d.n_flt ) { @@ -443,8 +445,30 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; } return; } + else if ( rtok->tok_type==TOK_NE ) // exact match + { + if ( !line->d.n_flt ) + { + if ( atok->hdr_id==-1 ) return; // missing value + rtok->pass_site = 1; + return; // no filter present, eval to true + } + if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present + rtok->pass_site = 1; + return; + } + else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present + { + if ( !line->d.n_flt ) + { + if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } + return; // no filter present, eval to false + } + if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1; + return; + } else - error("Only == and != operators are supported for FILTER\n"); + error("Only ==, !=, ~, and !~ operators are supported for FILTER\n"); return; } static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) @@ -516,7 +540,7 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break; case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break; - default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; + default: 
fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break; } #undef BRANCH return -1; // this shouldn't happen @@ -1038,54 +1062,46 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; return; } - if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n"); - + int j,nmissing = 0; - for (i=0; in_sample; i++) - { - int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size); - for (j=0; jn; j++) - { - if ( ptr[j]==bcf_int8_vector_end ) break; - if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } - } + #define BRANCH(type_t, is_vector_end) { \ + for (i=0; in_sample; i++) \ + { \ + type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \ + for (j=0; jn; j++) \ + { \ + if ( ptr[j]==is_vector_end ) break; \ + if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \ + } \ + } \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; + default: fprintf(bcftools_stderr,"todo: type %d\n", fmt->type); bcftools_exit(1); break; } + #undef BRANCH tok->nvalues = 1; tok->values[0] = tok->tag[0]=='N' ? 
nmissing : (double)nmissing / line->n_sample; } static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - if ( nstack==0 ) error("Error parsing the expresion\n"); + if ( nstack==0 ) error("Error parsing the expression\n"); token_t *tok = stack[nstack - 1]; if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); - - rtok->nsamples = tok->nsamples; - memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); - assert(tok->usmpl); - if ( !rtok->usmpl ) - { - rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); - memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); - } int i, npass = 0; - for (i=0; insamples; i++) + for (i=0; insamples; i++) { - if ( !rtok->usmpl[i] ) continue; - if ( rtok->pass_samples[i] ) npass++; + if ( !tok->usmpl[i] ) continue; + if ( tok->pass_samples[i] ) npass++; } - - hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); - double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); - rtok->nval1 = 1; - rtok->nvalues = rtok->nsamples; - - // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats - // consider only the passing site AND samples. The values for failed samples is set to -1 so - // that it can never conflict with valid expressions. - for (i=0; insamples; i++) - rtok->values[i] = rtok->pass_samples[i] ? value : -1; + hts_expand(double,1,rtok->mvalues,rtok->values); + rtok->nsamples = 0; + rtok->nvalues = 1; + rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 
1.0*npass/line->n_sample : 0); return 1; } @@ -1167,13 +1183,30 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - double val = -HUGE_VAL; - int i, has_value = 0; - for (i=0; invalues; i++) + double *ptr, val = -HUGE_VAL; + int i,j, has_value = 0; + if ( tok->nsamples ) { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val < tok->values[i] ) val = tok->values[i]; + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val < ptr[j] ) val = ptr[j]; + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val < tok->values[i] ) val = tok->values[i]; + } } if ( has_value ) { @@ -1182,18 +1215,65 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_max(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, has_value; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = -HUGE_VAL; + has_value = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val < ptr[j] ) val = ptr[j]; + } + if ( has_value ) rtok->values[i] = val; + else 
bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - double val = HUGE_VAL; - int i, has_value = 0; - for (i=0; invalues; i++) + double *ptr, val = HUGE_VAL; + int i,j, has_value = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val > ptr[j] ) val = ptr[j]; + } + } + } + else { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val > tok->values[i] ) val = tok->values[i]; + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val > tok->values[i] ) val = tok->values[i]; + } } if ( has_value ) { @@ -1202,15 +1282,62 @@ static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_min(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, has_value; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = HUGE_VAL; + has_value = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + if ( val > ptr[j] ) val = ptr[j]; + } + if ( has_value ) rtok->values[i] = val; + else 
bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) - if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + double *ptr, val = 0; + int i,j, n = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + val += ptr[j]; + n++; + } + } + } + else + { + for (i=0; invalues; i++) + if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + } if ( n ) { rtok->values[0] = val / n; @@ -1218,6 +1345,34 @@ static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, n; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = 0; + n = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) { val += ptr[j]; n++; } + } + if ( n ) rtok->values[i] = val / n; + else bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int compare_doubles(const void *lhs, const void *rhs) { double arg1 = *(const double*) lhs; @@ -1231,12 +1386,29 @@ static int 
func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - int i, n = 0; - for (i=0; invalues; i++) + // sweep through all tok->values and while excluding all missing values reuse the very same array + int i,j,k = 0, n = 0; + if ( tok->nsamples ) { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - if ( n < i ) tok->values[n] = tok->values[i]; - n++; + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k += tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue; + if ( n < k ) tok->values[n] = tok->values[k]; + n++; + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + if ( n < i ) tok->values[n] = tok->values[i]; + n++; + } } if ( !n ) return 1; if ( n==1 ) rtok->values[0] = tok->values[0]; @@ -1248,40 +1420,149 @@ static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta rtok->nvalues = 1; return 1; } +static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, n; + double *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + n = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + if ( n < j ) ptr[n] = ptr[j]; + n++; + } + if ( n==0 ) + bcf_double_set_missing(rtok->values[i]); + else if ( n==1 ) + rtok->values[i] = ptr[0]; + else + { 
+ qsort(ptr, n, sizeof(double), compare_doubles); + rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5; + } + } + return 1; +} static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; rtok->nvalues = 0; if ( !tok->nvalues ) return 1; - int i, n = 0; - for (i=0; invalues; i++) + // sweep through all tok->values and while excluding all missing values reuse the very same array + int i,j,k = 0, n = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k += tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue; + if ( n < k ) tok->values[n] = tok->values[k]; + n++; + } + } + } + else { - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - if ( n < i ) tok->values[n] = tok->values[i]; - n++; + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; + if ( n < i ) tok->values[n] = tok->values[i]; + n++; + } } if ( !n ) return 1; if ( n==1 ) rtok->values[0] = 0; else { double sdev = 0, avg = 0; - for (i=0; ivalues[n]; + for (i=0; ivalues[i]; avg /= n; - for (i=0; ivalues[n] - avg) * (tok->values[n] - avg); + for (i=0; ivalues[i] - avg) * (tok->values[i] - avg); rtok->values[0] = sqrt(sdev/n); } rtok->nvalues = 1; return 1; } +static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, n; + double *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + n = 
0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + if ( n < j ) ptr[n] = ptr[j]; + n++; + } + if ( n==0 ) + bcf_double_set_missing(rtok->values[i]); + else if ( n==1 ) + rtok->values[i] = 0; + else + { + double sdev = 0, avg = 0; + for (j=0; jvalues[i] = sqrt(sdev/n); + } + } + return 1; +} static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { rtok->nvalues = 0; token_t *tok = stack[nstack - 1]; if ( !tok->nvalues ) return 1; - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) - if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + double *ptr, val = 0; + int i,j, n = 0; + if ( tok->nsamples ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + val += ptr[j]; + n++; + } + } + } + else + { + for (i=0; invalues; i++) + if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } + } if ( n ) { rtok->values[0] = val; @@ -1289,39 +1570,104 @@ static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, } return 1; } +static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack); + rtok->nsamples = tok->nsamples; + rtok->nvalues = tok->nsamples; + rtok->nval1 = 1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + assert(tok->usmpl); + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + int i, j, has_value; + double val, *ptr; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + val = 0; + has_value = 0; + ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + { + 
if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue; + has_value = 1; + val += ptr[j]; + } + if ( has_value ) rtok->values[i] = val; + else bcf_double_set_missing(rtok->values[i]); + } + return 1; +} static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; if ( tok->is_str ) error("ABS() can be applied only on numeric values\n"); - + rtok->nsamples = tok->nsamples; rtok->nvalues = tok->nvalues; + rtok->nval1 = tok->nval1; + hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values); + if ( tok->usmpl ) + { + if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); + memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); + } if ( !tok->nvalues ) return 1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); - int i; - for (i=0; invalues; i++) - if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); - else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); + int i,j,k = 0; + if ( tok->usmpl ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]); + else rtok->values[k] = fabs(tok->values[k]); + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( tok->usmpl && !tok->usmpl[i] ) continue; + if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); + else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); + } + } return 1; } static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { token_t *tok = stack[nstack - 1]; - int i, cnt = 0; - if ( !tok->nsamples ) + int i,j, cnt = 0; + if ( tok->tag && tok->nsamples ) { - if ( tok->is_str ) + // raw number of values in a FMT tag, e.g. 
COUNT(FMT/TAG) + if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n"); + for (i=0; insamples; i++) { - if ( tok->str_value.l ) cnt = 1; - for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; + if ( !tok->usmpl[i] ) continue; + double *ptr = tok->values + i*tok->nval1; + for (j=0; jnval1; j++) + if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++; } - else - cnt = tok->nvalues; } - else + else if ( tok->nsamples ) { + // number of samples that pass a processed FMT tag for (i=0; insamples; i++) if ( tok->pass_samples[i] ) cnt++; } + else if ( tok->is_str ) + { + if ( tok->str_value.l ) cnt = 1; + for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; + } + else + cnt = tok->nvalues; rtok->nvalues = 1; rtok->values[0] = cnt; @@ -1533,11 +1879,27 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac if ( !tok->nvalues ) return 1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); - int i; - for (i=0; invalues; i++) - if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); - else rtok->values[i] = -4.34294481903*log(tok->values[i]); - + int i,j,k = 0; + if ( tok->usmpl ) + { + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; } + for (j=0; jnval1; k++,j++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]); + else rtok->values[k] = -4.34294481903*log(tok->values[k]); + } + } + } + else + { + for (i=0; invalues; i++) + { + if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); + else rtok->values[i] = -4.34294481903*log(tok->values[i]); + } + } return 1; } inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) @@ -1557,7 +1919,8 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) for (i=0; insamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; for 
(i=0; insamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; } - memset(rtok->pass_samples, 0, rtok->nsamples); + if (rtok->nsamples) + memset(rtok->pass_samples, 0, rtok->nsamples); } #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ @@ -1582,22 +1945,37 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) rtok->values[i] = atok->values[i] AOP btok->values[i]; \ } \ } \ + else if ( atok->nsamples ) \ + { \ + assert( btok->nvalues==1 ); \ + if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \ + { \ + for (i=0; invalues; i++) \ + { \ + if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \ + { \ + bcf_double_set_missing(rtok->values[i]); \ + continue; \ + } \ + has_values = 1; \ + rtok->values[i] = atok->values[i] AOP btok->values[0]; \ + } \ + } \ + } \ else \ { \ - token_t *xtok = atok->nsamples ? atok : btok; \ - token_t *ytok = atok->nsamples ? btok : atok; \ - assert( ytok->nvalues==1 ); \ - if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \ + assert( atok->nvalues==1 ); \ + if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \ { \ - for (i=0; invalues; i++) \ + for (i=0; invalues; i++) \ { \ - if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \ + if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ { \ bcf_double_set_missing(rtok->values[i]); \ continue; \ } \ has_values = 1; \ - rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \ + rtok->values[i] = atok->values[0] AOP btok->values[i]; \ } \ } \ } \ @@ -1713,14 +2091,6 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token return 2; } -#define CMP_MISSING(atok,btok,CMP_OP,ret) \ -{ \ - if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \ - token_t *tok = (atok)->is_missing ? (btok) : (atok); \ - (ret) = ( tok->nvalues CMP_OP 1 ) ? 
0 : 1; \ - tok->nvalues = 1; \ -} - #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \ { \ token_t *rtok = _rtok; \ @@ -1823,31 +2193,56 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token } \ } \ } \ - else \ + else if ( atok->nsamples )\ + { \ + for (i=0; insamples; i++) \ + { \ + if ( !rtok->usmpl[i] ) continue; \ + double *aptr = atok->values + i*atok->nval1; \ + double *bptr = btok->values + i*btok->nval1; \ + for (j=0; jnval1; j++) \ + { \ + int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \ + if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ + for (k=0; knvalues; k++) \ + { \ + int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \ + if ( nmiss ) \ + { \ + if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \ + } \ + else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \ + { \ + if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \ + } \ + else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \ + } \ + } \ + } \ + } \ + else /* btok->nsamples */ \ { \ - token_t *xtok = atok->nsamples ? atok : btok; \ - token_t *ytok = atok->nsamples ? btok : atok; \ - for (i=0; insamples; i++) \ + for (i=0; insamples; i++) \ { \ if ( !rtok->usmpl[i] ) continue; \ - double *xptr = xtok->values + i*xtok->nval1; \ - double *yptr = ytok->values + i*ytok->nval1; \ - for (j=0; jnval1; j++) \ + double *aptr = atok->values + i*atok->nval1; \ + double *bptr = btok->values + i*btok->nval1; \ + for (j=0; jnval1; j++) \ { \ - int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \ + int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 
1 : 0; \ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ - for (k=0; knvalues; k++) \ + for (k=0; knvalues; k++) \ { \ - int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 1 : 0); \ + int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \ if ( nmiss ) \ { \ - if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \ } \ - else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \ + else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \ { \ - if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \ } \ - else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \ } \ } \ } \ @@ -2346,7 +2741,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0; is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0; - if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s); + if ( is_info && is_fmt ) + error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s); } if ( is_fmt==-1 ) is_fmt = 0; } @@ -2835,6 +3231,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) // Additionally, treat "." as missing value rather than a string in numeric equalities; that // @file is only used with ID; etc. 
// This code is fragile: improve me. + static int comma_separator_warned = 0; int i; for (i=0; istr); } + if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') ) + { + int print_note = 0; + if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1; + else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1; + if ( print_note && !comma_separator_warned ) + { + comma_separator_warned = 1; + fprintf(bcftools_stderr, + "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n" + " (Search the manual for \"Comma in strings\" to learn more.)\n"); + } + } if ( out[i].tok_type!=TOK_VAL ) continue; if ( !out[i].tag ) continue; if ( out[i].setter==filters_set_type ) @@ -2941,11 +3351,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int itok = i, ival; if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1; - else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1; - else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1; + else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1; + else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1; else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i; - else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i; - else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i; + else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i; + else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i; else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( out[ival].tok_type!=TOK_VAL || !out[ival].key ) error("[%s:%d 
%s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); @@ -2978,6 +3388,12 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; } else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; } + else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; } hts_expand0(double,1,out[i].mvalues,out[i].values); if ( filter->nsamples ) { @@ -3153,3 +3569,32 @@ int filter_max_unpack(filter_t *flt) { return flt->max_unpack; } + +const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1) +{ + token_t *tok = filter->flt_stack[0]; + if ( tok->nvalues ) + { + *nval = tok->nvalues; + *nval1 = tok->nval1; + } + else + { + if ( !tok->values ) error("fixme in filter_get_doubles(): %s\n", filter->str); + *nval = 1; + *nval1 = 1; + tok->values[0] = filter->flt_stack[0]->pass_site; + } + return tok->values; +} + +void filter_set_samples(filter_t *filter, const uint8_t *samples) +{ + int i,j; + for (i=0; infilters; i++) + { + if ( !filter->filters[i].nsamples ) continue; + for (j=0; jfilters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j]; + } +} + diff --git a/bcftools/filter.h b/bcftools/filter.h index ccd3fe3..243e3b6 
100644 --- a/bcftools/filter.h +++ b/bcftools/filter.h @@ -1,6 +1,6 @@ /* filter.h -- filter expressions. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -46,6 +46,18 @@ void filter_destroy(filter_t *filter); */ int filter_test(filter_t *filter, bcf1_t *rec, const uint8_t **samples); +/** + * filter_set_samples() - restrict filtering expression to samples. + * Call after filter_init(). + * @samples: use samples set to 1, ignore samples set 0 + */ +void filter_set_samples(filter_t *filter, const uint8_t *samples); + +/** + * filter_get_doubles() - return a pointer to values from the last filter_test() evaluation + */ +const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1); + void filter_expression_info(FILE *fp); int filter_max_unpack(filter_t *filter); diff --git a/bcftools/hclust.c b/bcftools/hclust.c index 692fa54..945c70e 100644 --- a/bcftools/hclust.c +++ b/bcftools/hclust.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "bcftools.h" #include "hclust.h" diff --git a/bcftools/hclust.c.pysam.c b/bcftools/hclust.c.pysam.c index 29da67c..0a90af8 100644 --- a/bcftools/hclust.c.pysam.c +++ b/bcftools/hclust.c.pysam.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "bcftools.h" #include "hclust.h" diff --git a/bcftools/htslib-1.10.2/LICENSE b/bcftools/htslib-1.10.2/LICENSE deleted file mode 100644 index f70e757..0000000 --- a/bcftools/htslib-1.10.2/LICENSE +++ /dev/null @@ -1,69 +0,0 @@ -[Files in this distribution outwith the cram/ subdirectory are distributed -according to the terms of the following MIT/Expat license.] - -The MIT/Expat License - -Copyright (C) 2012-2019 Genome Research Ltd. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. - - -[Files within the cram/ subdirectory in this distribution are distributed -according to the terms of the following Modified 3-Clause BSD license.] - -The Modified-BSD License - -Copyright (C) 2012-2019 Genome Research Ltd. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute - nor the names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -[The use of a range of years within a copyright notice in this distribution -should be interpreted as being equivalent to a list of years including the -first and last year specified and all consecutive years between them. - -For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, -2011-2012" should be interpreted as being identical to a notice that reads -"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice -that reads "Copyright (C) 2005-2012" should be interpreted as being identical -to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, -2011, 2012".] diff --git a/bcftools/htslib-1.10.2/README b/bcftools/htslib-1.10.2/README deleted file mode 100644 index 4225bec..0000000 --- a/bcftools/htslib-1.10.2/README +++ /dev/null @@ -1,5 +0,0 @@ -HTSlib is an implementation of a unified C library for accessing common file -formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing -data. It is the core library used by samtools and bcftools. - -See INSTALL for building and installation instructions. 
diff --git a/bcftools/main.c b/bcftools/main.c index 2e3e56d..f892711 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -1,6 +1,6 @@ /* main.c -- main bcftools command front-end. - Copyright (C) 2012-2018 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -58,7 +58,7 @@ int main_plugin(int argc, char *argv[]); #endif int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); -int bam_mpileup(int argc, char *argv[]); +int main_mpileup(int argc, char *argv[]); int main_sort(int argc, char *argv[]); typedef struct @@ -164,7 +164,7 @@ static cmd_t cmds[] = .alias = "gtcheck", .help = "check sample concordance, detect sample swaps and contamination" }, - { .func = bam_mpileup, + { .func = main_mpileup, .alias = "mpileup", .help = "multi-way pileup producing genotype likelihoods" }, @@ -251,7 +251,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index c7cd4b0..bfd0f04 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -2,7 +2,7 @@ /* main.c -- main bcftools command front-end. - Copyright (C) 2012-2018 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -60,7 +60,7 @@ int main_plugin(int argc, char *argv[]); #endif int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); -int bam_mpileup(int argc, char *argv[]); +int main_mpileup(int argc, char *argv[]); int main_sort(int argc, char *argv[]); typedef struct @@ -166,7 +166,7 @@ static cmd_t cmds[] = .alias = "gtcheck", .help = "check sample concordance, detect sample swaps and contamination" }, - { .func = bam_mpileup, + { .func = main_mpileup, .alias = "mpileup", .help = "multi-way pileup producing genotype likelihoods" }, @@ -253,7 +253,7 @@ int bcftools_main(int argc, char *argv[]) if (argc < 2) { usage(bcftools_stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version()); + fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/mcall.c b/bcftools/mcall.c index 325093d..e96d41d 100644 --- a/bcftools/mcall.c +++ b/bcftools/mcall.c @@ -1,6 +1,6 @@ /* mcall.c -- multiallelic and rare variant calling. - Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -22,11 +22,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include #include #include +#include #include #include #include "call.h" +#include "prob1.h" // Using priors for GTs does not seem to be mathematically justified. Although // it seems effective in removing false calls, it also flips a significant @@ -38,6 +41,7 @@ THE SOFTWARE. */ // genotypes is reported instead. 
#define FLAT_PDG_FOR_MISSING 0 +int test16(float *anno16, anno16_t *a); void qcall_init(call_t *call) { return; } void qcall_destroy(call_t *call) { return; } @@ -249,19 +253,46 @@ static void init_sample_groups(call_t *call) if ( !call->sample_groups ) { // standard pooled calling, all samples in the same group - grp_t *grps = &call->smpl_grp; - grps->ngrp = 1; - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); + call->nsmpl_grp = 1; + call->smpl_grp = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp)); + call->smpl_grp[0].nsmpl = nsmpl; + call->smpl_grp[0].smpl = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t)); + for (i=0; ismpl_grp[0].smpl[i] = i; + return; + } + + if ( call->sample_groups_tag ) + { + // Is the tag defined in the header? + int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag); + if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag); + } + else + { + int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS"); + if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS"; + else + { + tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD"); + if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD"; + else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n"); + } } - else if ( !strcmp("-",call->sample_groups) ) + + // Read samples/groups + if ( !strcmp("-",call->sample_groups) ) { // single-sample calling, each sample creates its own group - grp_t *grps = &call->smpl_grp; - grps->ngrp = nsmpl; - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); - for (i=0; ismpl2grp[i] = i; + call->nsmpl_grp = nsmpl; + call->smpl_grp = 
(smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp)); + for (i=0; ismpl_grp[i].nsmpl = 1; + call->smpl_grp[i].smpl = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t)); + call->smpl_grp[i].smpl[0] = i; + } } else { @@ -269,40 +300,49 @@ static void init_sample_groups(call_t *call) char **lines = hts_readlist(call->sample_groups, 1, &nlines); if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); - uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); + uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); + uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); void *grp2idx = khash_str2int_init(); - grp_t *grps = &call->smpl_grp; + call->nsmpl_grp = 0; for (i=0; isample_groups,lines[i]); - *ptr = 0; + char *tmp = ptr; + while ( *ptr && isspace(*ptr) ) ptr++; + if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]); + *tmp = 0; int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); if ( ismpl<0 ) continue; - if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); + if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); if ( !khash_str2int_has_key(grp2idx,ptr+1) ) { - khash_str2int_inc(grp2idx, ptr+1); - grps->ngrp++; + khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp); + call->nsmpl_grp++; } - int igrp; - if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) - smpl2grp1[ismpl] = igrp+1; - else + int igrp = -1; + if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 ) error("This should not happen, fixme: %s\n",ptr+1); + grp2n[igrp]++; + smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples } khash_str2int_destroy(grp2idx); + if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups); - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - 
grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); + call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp)); for (i=0; ihdr->samples[i],call->sample_groups); - grps->smpl2grp[i] = smpl2grp1[i] - 1; + if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups); + int igrp = smpl2grp[i] - 1; + if ( !call->smpl_grp[igrp].nsmpl ) + call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t)); + call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i; + call->smpl_grp[igrp].nsmpl++; } - free(smpl2grp1); + free(smpl2grp); + free(grp2n); for (i=0; ismpl_grp; - for (i=0; ingrp; i++) - free(grps->grp[i].qsum); - free(grps->grp); - free(grps->smpl2grp); + for (i=0; insmpl_grp; i++) + { + free(call->smpl_grp[i].qsum); + free(call->smpl_grp[i].smpl); + } + free(call->smpl_grp); } void mcall_init(call_t *call) { + init_sample_groups(call); call_init_pl2p(call); call->nals_map = 5; @@ -341,15 +383,15 @@ void mcall_init(call_t *call) if ( call->output_tags & CALL_FMT_GQ ) bcf_hdr_append(call->hdr,"##FORMAT="); if ( call->output_tags & CALL_FMT_GP ) - bcf_hdr_append(call->hdr,"##FORMAT="); + bcf_hdr_append(call->hdr,"##FORMAT="); if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr)); - bcf_hdr_append(call->hdr,"##INFO="); - bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); + if ( call->output_tags & CALL_FMT_PV4 ) + bcf_hdr_append(call->hdr,"##INFO=\n"); // init the prior if ( call->theta>0 ) @@ -372,8 +414,6 @@ void mcall_init(call_t *call) } call->theta = log(call->theta); } - - init_sample_groups(call); } void mcall_destroy(call_t *call) @@ -394,7 +434,6 @@ void mcall_destroy(call_t *call) free(call->pdg); free(call->als); free(call->ac); - free(call->qsum); return; } @@ 
-505,14 +544,14 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse } // Create mapping between old and new (trimmed) alleles -void init_allele_trimming_maps(call_t *call, int als, int nals) +void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out) { - int i, j; + int i, j, nout = 0; // als_map: old(i) -> new(j) - for (i=0, j=0; ials_map[i] = j++; + if ( als_out & (1<als_map[i] = nout++; else call->als_map[i] = -1; } @@ -520,85 +559,16 @@ void init_allele_trimming_maps(call_t *call, int als, int nals) // pl_map: new(k) -> old(l) int k = 0, l = 0; - for (i=0; ipl_map[k++] = l; + if ( (als_out & (1<pl_map[k++] = l; l++; } } } -double binom_dist(int N, double p, int k) -{ - int mean = (int) (N*p); - if ( mean==k ) return 1.0; - - double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p); - if ( k > N - k ) k = N - k; - if ( mean > N - mean ) mean = N - mean; - - if ( k < mean ) { int tmp = k; k = mean; mean = tmp; } - double diff = k - mean; - - double val = 1.0; - int i; - for (i=0; i10 && (1-q)*ndiploid>10 ) || ndiploid>200 ) - { - //fprintf(stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); - return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))); - } - - return binom_dist(ndiploid, q, nhets); -} - -float calc_HOB(int nref, int nalt, int nhets, int ndiploid) -{ - if ( !nref || !nalt || !ndiploid ) return HUGE_VAL; - - double fref = (double)nref/(nref+nalt); // fraction of reference allelels - double falt = (double)nalt/(nref+nalt); // non-ref als - return fabs((double)nhets/ndiploid - 2*fref*falt); -} - -/** - * log(sum_i exp(a_i)) - */ -// static inline double logsumexp(double *vals, int nvals) -// { -// int i; -// double max_exp = vals[0]; -// for (i=1; ihdr); + int nsmpl = grp->nsmpl; int ngts = nals*(nals+1)/2; // Single allele @@ -634,60 +603,45 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) double lk_tot = 0; int lk_tot_set = 0; int iaa = 
(ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype - int isample; - double *pdg = call->pdg + iaa; - for (isample=0; isamplepdg + grp->smpl[ismpl]*ngts + iaa; if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; } - pdg += ngts; } if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples else lk_tot += call->theta; // the prior UPDATE_MAX_LKs(1<0 && lk_tot_set); } - grp_t *grps = &call->smpl_grp; - // Two alleles if ( nals>1 ) { for (ia=0; iangrp==1 && grps->grp[0].qsum[ia]==0 ) continue; + if ( grp->qsum[ia]==0 ) continue; int iaa = (ia+1)*(ia+2)/2-1; for (ib=0; ibngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; + if ( grp->qsum[ib]==0 ) continue; double lk_tot = 0; int lk_tot_set = 0; - int ia_cov = 0, ib_cov = 0; - for (j=0; jngrp; j++) + double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]); + double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]); + double fa2 = fa*fa; + double fb2 = fb*fb; + double fab = 2*fa*fb; + int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; + for (is=0; isgrp[j]; - if ( grp->qsum[ia] ) ia_cov = 1; - if ( grp->qsum[ib] ) ib_cov = 1; - if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } - grp->dp = 1; - grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); - grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); - grp->fa2 = grp->fa*grp->fa; - grp->fb2 = grp->fb*grp->fb; - grp->fab = 2*grp->fa*grp->fb; - } - if ( !ia_cov || !ib_cov ) continue; - int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; - if ( !grp->dp ) continue; + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts; double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) - val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; - else if ( call->ploidy && call->ploidy[isample]==1 ) - val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; + if ( !call->ploidy || call->ploidy[ismpl]==2 ) + val = 
fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; + else if ( call->ploidy && call->ploidy[ismpl]==1 ) + val = fa*pdg[iaa] + fb*pdg[ibb]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; @@ -701,50 +655,38 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) { for (ia=0; iangrp==1 && grps->grp[0].qsum[ia]==0 ) continue; + if ( grp->qsum[ia]==0 ) continue; int iaa = (ia+1)*(ia+2)/2-1; for (ib=0; ibngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; + if ( grp->qsum[ib]==0 ) continue; int ibb = (ib+1)*(ib+2)/2-1; int iab = iaa - ia + ib; for (ic=0; icngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; + if ( grp->qsum[ic]==0 ) continue; double lk_tot = 0; - int lk_tot_set = 1; - int ia_cov = 0, ib_cov = 0, ic_cov = 0; - for (j=0; jngrp; j++) - { - grp1_t *grp = &grps->grp[j]; - if ( grp->qsum[ia] ) ia_cov = 1; - if ( grp->qsum[ib] ) ib_cov = 1; - if ( grp->qsum[ic] ) ic_cov = 1; - if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } - grp->dp = 1; - grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fa2 = grp->fa*grp->fa; - grp->fb2 = grp->fb*grp->fb; - grp->fc2 = grp->fc*grp->fc; - grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; - } - if ( !ia_cov || !ib_cov || !ic_cov ) continue; - int isample, icc = (ic+1)*(ic+2)/2-1; + int lk_tot_set = 0; + + double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fc = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fa2 = fa*fa; + double fb2 = fb*fb; + double fc2 = fc*fc; + double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; + int is, icc = 
(ic+1)*(ic+2)/2-1; int iac = iaa - ia + ic, ibc = ibb - ib + ic; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; - if ( !grp->dp ) continue; + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts; double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) - val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; - else if ( call->ploidy && call->ploidy[isample]==1 ) - val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; + if ( !call->ploidy || call->ploidy[ismpl]==2 ) + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; + else if ( call->ploidy && call->ploidy[ismpl]==1 ) + val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; // the prior @@ -755,25 +697,26 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) } } - call->ref_lk = ref_lk; - call->lk_sum = lk_sum; - *out_als = max_als; - int i, n = 0; for (i=0; imax_lk = max_lk; + grp->ref_lk = ref_lk; + grp->lk_sum = lk_sum; + grp->als = max_als; + grp->nals = n; + return n; } -static void mcall_set_ref_genotypes(call_t *call, int nals) +// Sets GT=0/0 or GT=. if PL=0,0,0 +static void mcall_set_ref_genotypes(call_t *call, int nals_ori) { int i; - int ngts = nals*(nals+1)/2; + int ngts = nals_ori*(nals_ori+1)/2; // need this to distinguish between GT=0/0 vs GT=. 
int nsmpl = bcf_hdr_nsamples(call->hdr); - for (i=0; iac[i] = 0; - call->nhets = 0; - call->ndiploid = 0; + for (i=0; iac[i] = 0; // nals_new<=nals_ori, never mind setting extra 0's // Set all genotypes to 0/0 or 0 int *gts = call->gts; @@ -799,34 +742,27 @@ static void mcall_set_ref_genotypes(call_t *call, int nals) } } -static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp) { int ia, ib, i; - int ngts = nals*(nals+1)/2; - int nsmpl = bcf_hdr_nsamples(call->hdr); - int nout_gts = nout_als*(nout_als+1)/2; - hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs); - - for (i=0; iac[i] = 0; - call->nhets = 0; - call->ndiploid = 0; + int ngts_ori = nals_ori*(nals_ori+1)/2; + int ngts_new = call->nals_new*(call->nals_new+1)/2; + int nsmpl = grp->nsmpl; #if USE_PRIOR_FOR_GTS float prior = exp(call->theta); #endif - float *gps = call->GPs - nout_gts; - double *pdg = call->pdg - ngts; - int *gts = call->gts - 2; - int isample; - for (isample = 0; isample < nsmpl; isample++) + int is; + for (is = 0; is < nsmpl; is++) { - int ploidy = call->ploidy ? call->ploidy[isample] : 2; - assert( ploidy>=0 && ploidy<=2 ); + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts_ori; + float *gps = call->GPs + ismpl*ngts_new; + int *gts = call->gts + ismpl*2; - pdg += ngts; - gts += 2; - gps += nout_gts; + int ploidy = call->ploidy ? call->ploidy[ismpl] : 2; + assert( ploidy>=0 && ploidy<=2 ); if ( !ploidy ) { @@ -838,8 +774,8 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a #if !FLAT_PDG_FOR_MISSING // Skip samples with zero depth, they have all pdg's equal to 0 - for (i=0; indiploid++; - // Default fallback for the case all LKs are the same gts[0] = bcf_gt_unphased(0); gts[1] = ploidy==2 ? 
bcf_gt_unphased(0) : bcf_int32_vector_end; // Non-zero depth, determine the most likely genotype - grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; double best_lk = 0; - for (ia=0; iaals & 1<qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; #if USE_PRIOR_FOR_GTS if ( ia!=0 ) lk *= prior; @@ -876,13 +809,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a if ( ploidy==2 ) { gts[1] = gts[0]; - for (ia=0; iaals & 1<als & 1<qsum[ia]*grp->qsum[ib]; #if USE_PRIOR_FOR_GTS @@ -899,7 +832,6 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a } } } - if ( gts[0] != gts[1] ) call->nhets++; } else gts[1] = bcf_int32_vector_end; @@ -907,55 +839,50 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a call->ac[ bcf_gt_allele(gts[0]) ]++; if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++; } - if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) + if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return; + double max, sum; + for (is=0; isGPs + isample*nout_gts; + int ismpl = grp->smpl[is]; + float *gps = call->GPs + ismpl*ngts_new; - int nmax; - if ( call->ploidy ) - { - if ( call->ploidy[isample]==2 ) nmax = nout_gts; - else if ( call->ploidy[isample]==1 ) nmax = nout_als; - else nmax = 0; - } - else nmax = nout_gts; + int nmax; + if ( call->ploidy ) + { + if ( call->ploidy[ismpl]==2 ) nmax = ngts_new; + else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals; + else nmax = 0; + } + else nmax = ngts_new; - max = gps[0]; - if ( max<0 || nmax==0 ) - { - // no call - if ( call->output_tags & CALL_FMT_GP ) - { - for (i=0; iGQs[isample] = 0; - continue; - } - sum = gps[0]; - for (i=1; iGQs[isample] = max<=INT8_MAX ? max : INT8_MAX; + max = gps[0]; + if ( max<0 || nmax==0 ) + { + // no call if ( call->output_tags & CALL_FMT_GP ) { - assert( max ); - for (i=0; iGQs[ismpl] = 0; + continue; + } + sum = gps[0]; + for (i=1; iGQs[ismpl] = max<=INT8_MAX ? 
max : INT8_MAX; + if ( call->output_tags & CALL_FMT_GP ) + { + assert( max ); + for (i=0; ioutput_tags & CALL_FMT_GP ) - bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts); - if ( call->output_tags & CALL_FMT_GQ ) - bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); } @@ -978,12 +905,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a Individual qualities are calculated as GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y) */ -static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +#if 0 +static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new) { int ia, ib, i; int nsmpl = bcf_hdr_nsamples(call->hdr); int ngts = nals*(nals+1)/2; - int nout_gts = nout_als*(nout_als+1)/2; + int nout_gts = nals_new*(nals_new+1)/2; double *gls = call->GLs - nout_gts; double *pdg = call->pdg - ngts; @@ -1013,7 +941,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n double best_lk = 0; for (ia=0; iaals_map[ia],call->als_map[ia]); double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; @@ -1029,10 +957,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n { for (ia=0; iaals_map[ia],call->als_map[ib]); double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; @@ -1076,8 +1004,8 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n for (ifm=0; ifmnfams; ifm++) { family_t *fam = &call->fams[ifm]; - int ntrio = call->ntrio[fam->type][nout_als]; - uint16_t *trio = call->trio[fam->type][nout_als]; + int ntrio = call->ntrio[fam->type][nals_new]; + uint16_t *trio = call->trio[fam->type][nals_new]; // Unconstrained likelihood int uc_itr = 0; @@ -1225,11 +1153,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl); } } +#endif -static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new) { - int ngts = nals*(nals+1)/2; - int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new + int npls_src = nals_ori*(nals_ori+1)/2; + int npls_dst = nals_new*(nals_new+1)/2; // number of PL values in diploid samples, ori and new if ( call->all_diploid && npls_src == npls_dst ) return; int *pls_src = call->PLs, *pls_dst = call->PLs; @@ -1246,7 +1175,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in } else if ( ploidy==1 ) { - for (ia=0; iapl_map[isrc] ]; @@ -1256,7 +1185,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in else { pls_dst[0] = bcf_int32_missing; - pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall() + pls_dst[1] = bcf_int32_vector_end; // relying on nals_new>1 in mcall() } pls_src += npls_src; pls_dst += npls_dst; @@ -1264,9 +1193,9 @@ static void mcall_trim_PLs(call_t *call, bcf1_t 
*rec, int nals, int nout_als, in bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl); } -void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new) { - if ( nals==nout_als ) return; + if ( nals_ori==nals_new ) return; int i,j, nret, size = sizeof(float); @@ -1285,17 +1214,17 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); if ( nret<=0 ) continue; - if ( nout_als==1 ) + if ( nals_new==1 ) bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change else { - for (j=0; jals_map[j]; if ( k==-1 ) continue; // to be dropped memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size); } - bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als); + bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new); } } @@ -1312,21 +1241,21 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o if (nret<=0) continue; int nsmpl = bcf_hdr_nsamples(call->hdr); - assert( nret==nals*nsmpl ); + assert( nret==nals_ori*nsmpl ); for (j=0; jals_map[k]; if ( l==-1 ) continue; // to be dropped memcpy(ptr_dst+size*l, ptr_src+size*k, size); } } - bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl); + bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl); } call->PLs = (int32_t*) tmp_new; @@ -1441,12 +1370,12 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) } bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); - // update QS - int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); - hts_expand(float,nals,call->nqsum,call->qsum); + // update QS, use temporarily call->GPs to store the values + int nqs = 
bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum); + hts_expand(float,nals,call->nGPs,call->GPs); for (i=0; iqsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; - bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); + call->GPs[i] = call->als_map[i]smpl_grp[0].qsum[call->als_map[i]] : 0; + bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals); // update any Number=R tags void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point @@ -1487,7 +1416,6 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->itmp = (int32_t*) tmp_ori; call->n_itmp = ntmp_ori; - if ( *unseen ) *unseen = nals-1; return 0; } @@ -1506,203 +1434,229 @@ int mcall(call_t *call, bcf1_t *rec) // Force alleles when calling genotypes given alleles was requested if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; - int nsmpl = bcf_hdr_nsamples(call->hdr); - int nals = rec->n_allele; - hts_expand(int,nals,call->nac,call->ac); - hts_expand(int,nals,call->nals_map,call->als_map); - hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map); + int nsmpl = bcf_hdr_nsamples(call->hdr); + int nals_ori = rec->n_allele; + hts_expand(int,nals_ori,call->nac,call->ac); + hts_expand(int,nals_ori,call->nals_map,call->als_map); + hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map); // Get the genotype likelihoods call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs); - if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only - error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs); + if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori ) // a mixture of diploid and haploid or haploid only + error("Wrong number of PL fields? 
nals=%d npl=%d\n", nals_ori,call->nPLs); // Convert PLs to probabilities - int ngts = nals*(nals+1)/2; + int ngts_ori = nals_ori*(nals_ori+1)/2; hts_expand(double, call->nPLs, call->npdg, call->pdg); - set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); + set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen); // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. - if ( call->smpl_grp.ngrp == 1 ) + if ( call->nsmpl_grp == 1 ) { - int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); + int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum); if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); - if ( nqs < nals ) + if ( nqs < nals_ori ) { // Some of the listed alleles do not have the corresponding QS field. This is // typically ref-only site with <*> in ALT. - hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); - for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; + hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum); + for (i=nqs; ismpl_grp[0].qsum[i] = 0; } } else { - for (j=0; jsmpl_grp.ngrp; j++) + for (j=0; jnsmpl_grp; j++) { - hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); - memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); + hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum); + memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori); } - int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); - if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); + // Use FORMAT/AD or FORMAT/QS + int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs); + if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with 
\"-a AD\" or \"-a QS\"\n",call->sample_groups_tag); nad /= bcf_hdr_nsamples(call->hdr); - hts_expand(float,nals,call->nqsum,call->qsum); - float qsum = 0; - for (i=0; ihdr); i++) + for (i=0; insmpl_grp; i++) { - int32_t *ptr = call->ADs + i*nad; - for (j=0; jsmpl_grp[i]; + hts_expand(float,nals_ori,grp->nqsum,grp->qsum); + for (j=0; jqsum[j] = 0; + for (is=0; isnsmpl; is++) { - if ( ptr[j]==bcf_int32_vector_end ) break; - if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0; - else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } + int ismpl = grp->smpl[is]; + int32_t *ptr = call->ADs + ismpl*nad; + float sum = 0; + for (j=0; jqsum[j] += ptr[j]/sum; + } + } } - for (; jqsum[j] = 0; - if ( qsum ) - for (j=0; jqsum[j] /= qsum; - - grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; - for (j=0; jqsum[j] += call->qsum[j]; } } // If available, take into account reference panel AFs if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) { - int an = call->ac[0]; - if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) + int an = call->ac[0]; // number of alleles total, procede only if not zero; reuse call->ac + if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles { - int ac0 = an; // number of alleles in the reference population - for (i=0; iac[i]==bcf_int32_vector_end ) break; if ( call->ac[i]==bcf_int32_missing ) continue; ac0 -= call->ac[i]; - for (j=0; jsmpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; + + // here an*0.5 is the number of samples in the populatio and ac*0.5 is the AF weighted by the number of samples + for (j=0; jnsmpl_grp; j++) + call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an); } if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); - 
for (j=0; jsmpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[0] += ac0*0.5; - for (i=0; ismpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; - } + for (j=0; jnsmpl_grp; j++) + call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an); } } - for (j=0; jsmpl_grp.ngrp; j++) + // normalize so that QS sums to 1 for each group + for (j=0; jnsmpl_grp; j++) { - float qsum_tot = 0; - for (i=0; ismpl_grp.grp[j].qsum[i]; - if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; + float sum = 0; + for (i=0; ismpl_grp[j].qsum[i]; + if ( sum ) for (i=0; ismpl_grp[j].qsum[i] /= sum; } bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag - // Find the best combination of alleles - int out_als, nout; - if ( nals > 8*sizeof(out_als) ) + if ( nals_ori > 8*sizeof(call->als_new) ) { fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); return 0; } - nout = mcall_find_best_alleles(call, nals, &out_als); - // Make sure the REF allele is always present - if ( !(out_als&1) ) + // For each group find the best combination of alleles + call->als_new = 0; + double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL; + for (j=0; jnsmpl_grp; j++) { - out_als |= 1; - nout++; + smpl_grp_t *grp = &call->smpl_grp[j]; + mcall_find_best_alleles(call, nals_ori, grp); + call->als_new |= grp->als; + if ( grp->max_lk==-HUGE_VAL ) continue; + double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk)); + if ( max_qual < qual ) + { + max_qual = qual; + lk_sum = grp->lk_sum; + ref_lk = grp->ref_lk; + } } - int is_variant = out_als==1 ? 0 : 1; + + // Make sure the REF allele is always present + if ( !(call->als_new&1) ) call->als_new |= 1; + + int is_variant = call->als_new==1 ? 
0 : 1; if ( call->flag & CALL_VARONLY && !is_variant ) return 0; - // With -A, keep all ALTs except X - if ( call->flag & CALL_KEEPALT ) + call->nals_new = 0; + for (i=0; i0 && i==unseen ) continue; - out_als |= 1<0 && i==unseen ) continue; + if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<als_new & (1<nals_new++; } + init_allele_trimming_maps(call,nals_ori,call->als_new); + int nAC = 0; - if ( out_als==1 ) // only REF allele on output + if ( call->als_new==1 ) // only REF allele on output { - init_allele_trimming_maps(call, 1, nals); - mcall_set_ref_genotypes(call,nals); + mcall_set_ref_genotypes(call,nals_ori); bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now } + else if ( !is_variant ) + { + mcall_set_ref_genotypes(call,nals_ori); // running with -A, prevent mcall_call_genotypes from putting some ALT back + mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new); + } else { // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes. // Note that it is a valid outcome if the called genotypes exclude some of the ALTs. 
- init_allele_trimming_maps(call, out_als, nals); - if ( !is_variant ) - mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back - else if ( call->flag & CALL_CONSTR_TRIO ) + int ngts_new = call->nals_new*(call->nals_new+1)/2; + hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs); + for (i=0; inals_new; i++) call->ac[i] = 0; + + if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 ) + { + fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); + return 0; + } + if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) { - if ( nout>4 ) - { - fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - mcall_call_trio_genotypes(call, rec, nals,nout,out_als); + memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs)); + memset(call->GQs,0,nsmpl*sizeof(*call->GQs)); + } + for (i=0; insmpl_grp; i++) + { + if ( call->flag & CALL_CONSTR_TRIO ) + error("todo: constrained trio calling temporarily disabled\n"); //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]); + else + mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]); } - else - mcall_call_genotypes(call,rec,nals,nout,out_als); // Skip the site if all samples are 0/0. This can happen occasionally. 
- nAC = 0; - for (i=1; iac[i]; + for (i=1; inals_new; i++) nAC += call->ac[i]; if ( !nAC && call->flag & CALL_VARONLY ) return 0; - mcall_trim_PLs(call, rec, nals, nout, out_als); + + if ( call->output_tags & CALL_FMT_GP ) + bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new); + if ( call->output_tags & CALL_FMT_GQ ) + bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); + + mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new); } - if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als); + if ( nals_ori!=call->nals_new ) + mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new); - // Set QUAL and calculate HWE-related annotations + // Set QUAL if ( nAC ) { - float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid); - if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1); - - float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid); - if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1); - // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set - rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk)); + rec->qual = max_qual; } else { // Set the quality of a REF site - if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + if ( lk_sum!=-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk)); + else if ( call->ac[0] ) rec->qual = call->theta ? 
-4.343*call->theta : 0; else - rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk)); + bcf_float_set_missing(rec->qual); } - if ( rec->qual>999 ) rec->qual = 999; - if ( rec->qual>50 ) rec->qual = rint(rec->qual); - // AC, AN - if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1); + if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1); nAC += call->ac[0]; bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1); // Remove unused alleles - hts_expand(char*,nout,call->nals,call->als); - for (i=0; inals_new,call->nals,call->als); + for (i=0; ials_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i]; - bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout); + bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new); bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2); - // DP4 tag + // DP4 and PV4 tags if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 ) { int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3]; @@ -1710,10 +1664,22 @@ int mcall(call_t *call, bcf1_t *rec) int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]); bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1); + + if ( call->output_tags & CALL_FMT_PV4 ) + { + anno16_t a; + float tmpf[4]; + int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0; + if ( is_tested ) + { + for (i=0; i<4; i++) tmpf[i] = a.p[i]; + bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4); + } + } } bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag - return nout; + return call->nals_new; } diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c index 2c2fb37..c2d38a6 100644 --- a/bcftools/mcall.c.pysam.c +++ b/bcftools/mcall.c.pysam.c @@ -2,7 +2,7 @@ /* mcall.c -- multiallelic and rare variant calling. 
- Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -24,11 +24,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include #include #include +#include #include #include #include "call.h" +#include "prob1.h" // Using priors for GTs does not seem to be mathematically justified. Although // it seems effective in removing false calls, it also flips a significant @@ -40,6 +43,7 @@ THE SOFTWARE. */ // genotypes is reported instead. #define FLAT_PDG_FOR_MISSING 0 +int test16(float *anno16, anno16_t *a); void qcall_init(call_t *call) { return; } void qcall_destroy(call_t *call) { return; } @@ -251,19 +255,46 @@ static void init_sample_groups(call_t *call) if ( !call->sample_groups ) { // standard pooled calling, all samples in the same group - grp_t *grps = &call->smpl_grp; - grps->ngrp = 1; - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); + call->nsmpl_grp = 1; + call->smpl_grp = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp)); + call->smpl_grp[0].nsmpl = nsmpl; + call->smpl_grp[0].smpl = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t)); + for (i=0; ismpl_grp[0].smpl[i] = i; + return; + } + + if ( call->sample_groups_tag ) + { + // Is the tag defined in the header? 
+ int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag); + if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag); + } + else + { + int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS"); + if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS"; + else + { + tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD"); + if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD"; + else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n"); + } } - else if ( !strcmp("-",call->sample_groups) ) + + // Read samples/groups + if ( !strcmp("-",call->sample_groups) ) { // single-sample calling, each sample creates its own group - grp_t *grps = &call->smpl_grp; - grps->ngrp = nsmpl; - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); - for (i=0; ismpl2grp[i] = i; + call->nsmpl_grp = nsmpl; + call->smpl_grp = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp)); + for (i=0; ismpl_grp[i].nsmpl = 1; + call->smpl_grp[i].smpl = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t)); + call->smpl_grp[i].smpl[0] = i; + } } else { @@ -271,40 +302,49 @@ static void init_sample_groups(call_t *call) char **lines = hts_readlist(call->sample_groups, 1, &nlines); if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); - uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); + uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); + uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); void *grp2idx = khash_str2int_init(); - grp_t *grps = &call->smpl_grp; + call->nsmpl_grp = 0; for (i=0; isample_groups,lines[i]); - *ptr = 0; + char *tmp = ptr; + while ( *ptr && isspace(*ptr) ) ptr++; + if ( !*ptr ) 
error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]); + *tmp = 0; int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); if ( ismpl<0 ) continue; - if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); + if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); if ( !khash_str2int_has_key(grp2idx,ptr+1) ) { - khash_str2int_inc(grp2idx, ptr+1); - grps->ngrp++; + khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp); + call->nsmpl_grp++; } - int igrp; - if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) - smpl2grp1[ismpl] = igrp+1; - else + int igrp = -1; + if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 ) error("This should not happen, fixme: %s\n",ptr+1); + grp2n[igrp]++; + smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples } khash_str2int_destroy(grp2idx); + if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups); - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); + call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp)); for (i=0; ihdr->samples[i],call->sample_groups); - grps->smpl2grp[i] = smpl2grp1[i] - 1; + if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups); + int igrp = smpl2grp[i] - 1; + if ( !call->smpl_grp[igrp].nsmpl ) + call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t)); + call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i; + call->smpl_grp[igrp].nsmpl++; } - free(smpl2grp1); + free(smpl2grp); + free(grp2n); for (i=0; ismpl_grp; - for (i=0; ingrp; i++) - free(grps->grp[i].qsum); - free(grps->grp); - free(grps->smpl2grp); + for (i=0; insmpl_grp; i++) + { + free(call->smpl_grp[i].qsum); + 
free(call->smpl_grp[i].smpl); + } + free(call->smpl_grp); } void mcall_init(call_t *call) { + init_sample_groups(call); call_init_pl2p(call); call->nals_map = 5; @@ -343,15 +385,15 @@ void mcall_init(call_t *call) if ( call->output_tags & CALL_FMT_GQ ) bcf_hdr_append(call->hdr,"##FORMAT="); if ( call->output_tags & CALL_FMT_GP ) - bcf_hdr_append(call->hdr,"##FORMAT="); + bcf_hdr_append(call->hdr,"##FORMAT="); if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr)); - bcf_hdr_append(call->hdr,"##INFO="); - bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); + if ( call->output_tags & CALL_FMT_PV4 ) + bcf_hdr_append(call->hdr,"##INFO=\n"); // init the prior if ( call->theta>0 ) @@ -374,8 +416,6 @@ void mcall_init(call_t *call) } call->theta = log(call->theta); } - - init_sample_groups(call); } void mcall_destroy(call_t *call) @@ -396,7 +436,6 @@ void mcall_destroy(call_t *call) free(call->pdg); free(call->als); free(call->ac); - free(call->qsum); return; } @@ -507,14 +546,14 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse } // Create mapping between old and new (trimmed) alleles -void init_allele_trimming_maps(call_t *call, int als, int nals) +void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out) { - int i, j; + int i, j, nout = 0; // als_map: old(i) -> new(j) - for (i=0, j=0; ials_map[i] = j++; + if ( als_out & (1<als_map[i] = nout++; else call->als_map[i] = -1; } @@ -522,85 +561,16 @@ void init_allele_trimming_maps(call_t *call, int als, int nals) // pl_map: new(k) -> old(l) int k = 0, l = 0; - for (i=0; ipl_map[k++] = l; + if ( (als_out & (1<pl_map[k++] = l; l++; } } } -double binom_dist(int N, double p, int k) -{ - int mean = (int) (N*p); - if ( mean==k ) return 1.0; - - double log_p = (k-mean)*log(p) + 
(mean-k)*log(1.0-p); - if ( k > N - k ) k = N - k; - if ( mean > N - mean ) mean = N - mean; - - if ( k < mean ) { int tmp = k; k = mean; mean = tmp; } - double diff = k - mean; - - double val = 1.0; - int i; - for (i=0; i10 && (1-q)*ndiploid>10 ) || ndiploid>200 ) - { - //fprintf(bcftools_stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); - return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))); - } - - return binom_dist(ndiploid, q, nhets); -} - -float calc_HOB(int nref, int nalt, int nhets, int ndiploid) -{ - if ( !nref || !nalt || !ndiploid ) return HUGE_VAL; - - double fref = (double)nref/(nref+nalt); // fraction of reference allelels - double falt = (double)nalt/(nref+nalt); // non-ref als - return fabs((double)nhets/ndiploid - 2*fref*falt); -} - -/** - * log(sum_i exp(a_i)) - */ -// static inline double logsumexp(double *vals, int nvals) -// { -// int i; -// double max_exp = vals[0]; -// for (i=1; ihdr); + int nsmpl = grp->nsmpl; int ngts = nals*(nals+1)/2; // Single allele @@ -636,60 +605,45 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) double lk_tot = 0; int lk_tot_set = 0; int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype - int isample; - double *pdg = call->pdg + iaa; - for (isample=0; isamplepdg + grp->smpl[ismpl]*ngts + iaa; if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; } - pdg += ngts; } if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples else lk_tot += call->theta; // the prior UPDATE_MAX_LKs(1<0 && lk_tot_set); } - grp_t *grps = &call->smpl_grp; - // Two alleles if ( nals>1 ) { for (ia=0; iangrp==1 && grps->grp[0].qsum[ia]==0 ) continue; + if ( grp->qsum[ia]==0 ) continue; int iaa = (ia+1)*(ia+2)/2-1; for (ib=0; ibngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; + if ( grp->qsum[ib]==0 ) continue; double lk_tot = 0; int lk_tot_set = 0; - int ia_cov = 0, ib_cov = 0; - for (j=0; jngrp; j++) + double fa = 
grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]); + double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]); + double fa2 = fa*fa; + double fb2 = fb*fb; + double fab = 2*fa*fb; + int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; + for (is=0; isgrp[j]; - if ( grp->qsum[ia] ) ia_cov = 1; - if ( grp->qsum[ib] ) ib_cov = 1; - if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } - grp->dp = 1; - grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); - grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); - grp->fa2 = grp->fa*grp->fa; - grp->fb2 = grp->fb*grp->fb; - grp->fab = 2*grp->fa*grp->fb; - } - if ( !ia_cov || !ib_cov ) continue; - int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; - if ( !grp->dp ) continue; + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts; double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) - val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; - else if ( call->ploidy && call->ploidy[isample]==1 ) - val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; + if ( !call->ploidy || call->ploidy[ismpl]==2 ) + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; + else if ( call->ploidy && call->ploidy[ismpl]==1 ) + val = fa*pdg[iaa] + fb*pdg[ibb]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; @@ -703,50 +657,38 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) { for (ia=0; iangrp==1 && grps->grp[0].qsum[ia]==0 ) continue; + if ( grp->qsum[ia]==0 ) continue; int iaa = (ia+1)*(ia+2)/2-1; for (ib=0; ibngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; + if ( grp->qsum[ib]==0 ) continue; int ibb = (ib+1)*(ib+2)/2-1; int iab = iaa - ia + ib; for (ic=0; icngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; + if ( grp->qsum[ic]==0 ) continue; double lk_tot = 0; - int lk_tot_set = 1; - int ia_cov = 
0, ib_cov = 0, ic_cov = 0; - for (j=0; jngrp; j++) - { - grp1_t *grp = &grps->grp[j]; - if ( grp->qsum[ia] ) ia_cov = 1; - if ( grp->qsum[ib] ) ib_cov = 1; - if ( grp->qsum[ic] ) ic_cov = 1; - if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } - grp->dp = 1; - grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fa2 = grp->fa*grp->fa; - grp->fb2 = grp->fb*grp->fb; - grp->fc2 = grp->fc*grp->fc; - grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; - } - if ( !ia_cov || !ib_cov || !ic_cov ) continue; - int isample, icc = (ic+1)*(ic+2)/2-1; + int lk_tot_set = 0; + + double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fc = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fa2 = fa*fa; + double fb2 = fb*fb; + double fc2 = fc*fc; + double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; + int is, icc = (ic+1)*(ic+2)/2-1; int iac = iaa - ia + ic, ibc = ibb - ib + ic; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; - if ( !grp->dp ) continue; + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts; double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) - val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; - else if ( call->ploidy && call->ploidy[isample]==1 ) - val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; + if ( !call->ploidy || call->ploidy[ismpl]==2 ) + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; + else if ( call->ploidy && call->ploidy[ismpl]==1 ) + val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; if ( val ) { lk_tot += 
log(val); lk_tot_set = 1; } - pdg += ngts; } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; // the prior @@ -757,25 +699,26 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) } } - call->ref_lk = ref_lk; - call->lk_sum = lk_sum; - *out_als = max_als; - int i, n = 0; for (i=0; imax_lk = max_lk; + grp->ref_lk = ref_lk; + grp->lk_sum = lk_sum; + grp->als = max_als; + grp->nals = n; + return n; } -static void mcall_set_ref_genotypes(call_t *call, int nals) +// Sets GT=0/0 or GT=. if PL=0,0,0 +static void mcall_set_ref_genotypes(call_t *call, int nals_ori) { int i; - int ngts = nals*(nals+1)/2; + int ngts = nals_ori*(nals_ori+1)/2; // need this to distinguish between GT=0/0 vs GT=. int nsmpl = bcf_hdr_nsamples(call->hdr); - for (i=0; iac[i] = 0; - call->nhets = 0; - call->ndiploid = 0; + for (i=0; iac[i] = 0; // nals_new<=nals_ori, never mind setting extra 0's // Set all genotypes to 0/0 or 0 int *gts = call->gts; @@ -801,34 +744,27 @@ static void mcall_set_ref_genotypes(call_t *call, int nals) } } -static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp) { int ia, ib, i; - int ngts = nals*(nals+1)/2; - int nsmpl = bcf_hdr_nsamples(call->hdr); - int nout_gts = nout_als*(nout_als+1)/2; - hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs); - - for (i=0; iac[i] = 0; - call->nhets = 0; - call->ndiploid = 0; + int ngts_ori = nals_ori*(nals_ori+1)/2; + int ngts_new = call->nals_new*(call->nals_new+1)/2; + int nsmpl = grp->nsmpl; #if USE_PRIOR_FOR_GTS float prior = exp(call->theta); #endif - float *gps = call->GPs - nout_gts; - double *pdg = call->pdg - ngts; - int *gts = call->gts - 2; - int isample; - for (isample = 0; isample < nsmpl; isample++) + int is; + for (is = 0; is < nsmpl; is++) { - int ploidy = call->ploidy ? 
call->ploidy[isample] : 2; - assert( ploidy>=0 && ploidy<=2 ); + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts_ori; + float *gps = call->GPs + ismpl*ngts_new; + int *gts = call->gts + ismpl*2; - pdg += ngts; - gts += 2; - gps += nout_gts; + int ploidy = call->ploidy ? call->ploidy[ismpl] : 2; + assert( ploidy>=0 && ploidy<=2 ); if ( !ploidy ) { @@ -840,8 +776,8 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a #if !FLAT_PDG_FOR_MISSING // Skip samples with zero depth, they have all pdg's equal to 0 - for (i=0; indiploid++; - // Default fallback for the case all LKs are the same gts[0] = bcf_gt_unphased(0); gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; // Non-zero depth, determine the most likely genotype - grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; double best_lk = 0; - for (ia=0; iaals & 1<qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; #if USE_PRIOR_FOR_GTS if ( ia!=0 ) lk *= prior; @@ -878,13 +811,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a if ( ploidy==2 ) { gts[1] = gts[0]; - for (ia=0; iaals & 1<als & 1<qsum[ia]*grp->qsum[ib]; #if USE_PRIOR_FOR_GTS @@ -901,7 +834,6 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a } } } - if ( gts[0] != gts[1] ) call->nhets++; } else gts[1] = bcf_int32_vector_end; @@ -909,55 +841,50 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a call->ac[ bcf_gt_allele(gts[0]) ]++; if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++; } - if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) + if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return; + double max, sum; + for (is=0; isGPs + isample*nout_gts; + int ismpl = grp->smpl[is]; + float *gps = call->GPs + ismpl*ngts_new; - int nmax; - if ( call->ploidy ) - { - if ( call->ploidy[isample]==2 ) nmax = nout_gts; - else if ( call->ploidy[isample]==1 ) nmax = 
nout_als; - else nmax = 0; - } - else nmax = nout_gts; + int nmax; + if ( call->ploidy ) + { + if ( call->ploidy[ismpl]==2 ) nmax = ngts_new; + else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals; + else nmax = 0; + } + else nmax = ngts_new; - max = gps[0]; - if ( max<0 || nmax==0 ) - { - // no call - if ( call->output_tags & CALL_FMT_GP ) - { - for (i=0; iGQs[isample] = 0; - continue; - } - sum = gps[0]; - for (i=1; iGQs[isample] = max<=INT8_MAX ? max : INT8_MAX; + max = gps[0]; + if ( max<0 || nmax==0 ) + { + // no call if ( call->output_tags & CALL_FMT_GP ) { - assert( max ); - for (i=0; iGQs[ismpl] = 0; + continue; + } + sum = gps[0]; + for (i=1; iGQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX; + if ( call->output_tags & CALL_FMT_GP ) + { + assert( max ); + for (i=0; ioutput_tags & CALL_FMT_GP ) - bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts); - if ( call->output_tags & CALL_FMT_GQ ) - bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); } @@ -980,12 +907,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a Individual qualities are calculated as GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y) */ -static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +#if 0 +static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new) { int ia, ib, i; int nsmpl = bcf_hdr_nsamples(call->hdr); int ngts = nals*(nals+1)/2; - int nout_gts = nout_als*(nout_als+1)/2; + int nout_gts = nals_new*(nals_new+1)/2; double *gls = call->GLs - nout_gts; double *pdg = call->pdg - ngts; @@ -1015,7 +943,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n double best_lk = 0; for (ia=0; iaals_map[ia],call->als_map[ia]); double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; @@ -1031,10 +959,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n { for (ia=0; iaals_map[ia],call->als_map[ib]); double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; @@ -1078,8 +1006,8 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n for (ifm=0; ifmnfams; ifm++) { family_t *fam = &call->fams[ifm]; - int ntrio = call->ntrio[fam->type][nout_als]; - uint16_t *trio = call->trio[fam->type][nout_als]; + int ntrio = call->ntrio[fam->type][nals_new]; + uint16_t *trio = call->trio[fam->type][nals_new]; // Unconstrained likelihood int uc_itr = 0; @@ -1227,11 +1155,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl); } } +#endif -static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new) { - int ngts = nals*(nals+1)/2; - int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new + int npls_src = nals_ori*(nals_ori+1)/2; + int npls_dst = nals_new*(nals_new+1)/2; // number of PL values in diploid samples, ori and new if ( call->all_diploid && npls_src == npls_dst ) return; int *pls_src = call->PLs, *pls_dst = call->PLs; @@ -1248,7 +1177,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in } else if ( ploidy==1 ) { - for (ia=0; iapl_map[isrc] ]; @@ -1258,7 +1187,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in else { pls_dst[0] = bcf_int32_missing; - pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall() + pls_dst[1] = bcf_int32_vector_end; // relying on nals_new>1 in mcall() } pls_src += npls_src; pls_dst += npls_dst; @@ -1266,9 +1195,9 @@ static void mcall_trim_PLs(call_t *call, bcf1_t 
*rec, int nals, int nout_als, in bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl); } -void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new) { - if ( nals==nout_als ) return; + if ( nals_ori==nals_new ) return; int i,j, nret, size = sizeof(float); @@ -1287,17 +1216,17 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); if ( nret<=0 ) continue; - if ( nout_als==1 ) + if ( nals_new==1 ) bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change else { - for (j=0; jals_map[j]; if ( k==-1 ) continue; // to be dropped memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size); } - bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als); + bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new); } } @@ -1314,21 +1243,21 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o if (nret<=0) continue; int nsmpl = bcf_hdr_nsamples(call->hdr); - assert( nret==nals*nsmpl ); + assert( nret==nals_ori*nsmpl ); for (j=0; jals_map[k]; if ( l==-1 ) continue; // to be dropped memcpy(ptr_dst+size*l, ptr_src+size*k, size); } } - bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl); + bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl); } call->PLs = (int32_t*) tmp_new; @@ -1443,12 +1372,12 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) } bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); - // update QS - int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); - hts_expand(float,nals,call->nqsum,call->qsum); + // update QS, use temporarily call->GPs to store the values + int nqs = 
bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum); + hts_expand(float,nals,call->nGPs,call->GPs); for (i=0; iqsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; - bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); + call->GPs[i] = call->als_map[i]smpl_grp[0].qsum[call->als_map[i]] : 0; + bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals); // update any Number=R tags void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point @@ -1489,7 +1418,6 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->itmp = (int32_t*) tmp_ori; call->n_itmp = ntmp_ori; - if ( *unseen ) *unseen = nals-1; return 0; } @@ -1508,203 +1436,229 @@ int mcall(call_t *call, bcf1_t *rec) // Force alleles when calling genotypes given alleles was requested if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; - int nsmpl = bcf_hdr_nsamples(call->hdr); - int nals = rec->n_allele; - hts_expand(int,nals,call->nac,call->ac); - hts_expand(int,nals,call->nals_map,call->als_map); - hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map); + int nsmpl = bcf_hdr_nsamples(call->hdr); + int nals_ori = rec->n_allele; + hts_expand(int,nals_ori,call->nac,call->ac); + hts_expand(int,nals_ori,call->nals_map,call->als_map); + hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map); // Get the genotype likelihoods call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs); - if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only - error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs); + if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori ) // a mixture of diploid and haploid or haploid only + error("Wrong number of PL fields? 
nals=%d npl=%d\n", nals_ori,call->nPLs); // Convert PLs to probabilities - int ngts = nals*(nals+1)/2; + int ngts_ori = nals_ori*(nals_ori+1)/2; hts_expand(double, call->nPLs, call->npdg, call->pdg); - set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); + set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen); // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. - if ( call->smpl_grp.ngrp == 1 ) + if ( call->nsmpl_grp == 1 ) { - int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); + int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum); if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); - if ( nqs < nals ) + if ( nqs < nals_ori ) { // Some of the listed alleles do not have the corresponding QS field. This is // typically ref-only site with <*> in ALT. - hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); - for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; + hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum); + for (i=nqs; ismpl_grp[0].qsum[i] = 0; } } else { - for (j=0; jsmpl_grp.ngrp; j++) + for (j=0; jnsmpl_grp; j++) { - hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); - memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); + hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum); + memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori); } - int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); - if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); + // Use FORMAT/AD or FORMAT/QS + int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs); + if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with 
\"-a AD\" or \"-a QS\"\n",call->sample_groups_tag); nad /= bcf_hdr_nsamples(call->hdr); - hts_expand(float,nals,call->nqsum,call->qsum); - float qsum = 0; - for (i=0; ihdr); i++) + for (i=0; insmpl_grp; i++) { - int32_t *ptr = call->ADs + i*nad; - for (j=0; jsmpl_grp[i]; + hts_expand(float,nals_ori,grp->nqsum,grp->qsum); + for (j=0; jqsum[j] = 0; + for (is=0; isnsmpl; is++) { - if ( ptr[j]==bcf_int32_vector_end ) break; - if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0; - else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } + int ismpl = grp->smpl[is]; + int32_t *ptr = call->ADs + ismpl*nad; + float sum = 0; + for (j=0; jqsum[j] += ptr[j]/sum; + } + } } - for (; jqsum[j] = 0; - if ( qsum ) - for (j=0; jqsum[j] /= qsum; - - grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; - for (j=0; jqsum[j] += call->qsum[j]; } } // If available, take into account reference panel AFs if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) { - int an = call->ac[0]; - if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) + int an = call->ac[0]; // number of alleles total, procede only if not zero; reuse call->ac + if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles { - int ac0 = an; // number of alleles in the reference population - for (i=0; iac[i]==bcf_int32_vector_end ) break; if ( call->ac[i]==bcf_int32_missing ) continue; ac0 -= call->ac[i]; - for (j=0; jsmpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; + + // here an*0.5 is the number of samples in the populatio and ac*0.5 is the AF weighted by the number of samples + for (j=0; jnsmpl_grp; j++) + call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an); } if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); - 
for (j=0; jsmpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[0] += ac0*0.5; - for (i=0; ismpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; - } + for (j=0; jnsmpl_grp; j++) + call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an); } } - for (j=0; jsmpl_grp.ngrp; j++) + // normalize so that QS sums to 1 for each group + for (j=0; jnsmpl_grp; j++) { - float qsum_tot = 0; - for (i=0; ismpl_grp.grp[j].qsum[i]; - if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; + float sum = 0; + for (i=0; ismpl_grp[j].qsum[i]; + if ( sum ) for (i=0; ismpl_grp[j].qsum[i] /= sum; } bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag - // Find the best combination of alleles - int out_als, nout; - if ( nals > 8*sizeof(out_als) ) + if ( nals_ori > 8*sizeof(call->als_new) ) { fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); return 0; } - nout = mcall_find_best_alleles(call, nals, &out_als); - // Make sure the REF allele is always present - if ( !(out_als&1) ) + // For each group find the best combination of alleles + call->als_new = 0; + double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL; + for (j=0; jnsmpl_grp; j++) { - out_als |= 1; - nout++; + smpl_grp_t *grp = &call->smpl_grp[j]; + mcall_find_best_alleles(call, nals_ori, grp); + call->als_new |= grp->als; + if ( grp->max_lk==-HUGE_VAL ) continue; + double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk)); + if ( max_qual < qual ) + { + max_qual = qual; + lk_sum = grp->lk_sum; + ref_lk = grp->ref_lk; + } } - int is_variant = out_als==1 ? 0 : 1; + + // Make sure the REF allele is always present + if ( !(call->als_new&1) ) call->als_new |= 1; + + int is_variant = call->als_new==1 ? 
0 : 1; if ( call->flag & CALL_VARONLY && !is_variant ) return 0; - // With -A, keep all ALTs except X - if ( call->flag & CALL_KEEPALT ) + call->nals_new = 0; + for (i=0; i0 && i==unseen ) continue; - out_als |= 1<0 && i==unseen ) continue; + if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<als_new & (1<nals_new++; } + init_allele_trimming_maps(call,nals_ori,call->als_new); + int nAC = 0; - if ( out_als==1 ) // only REF allele on output + if ( call->als_new==1 ) // only REF allele on output { - init_allele_trimming_maps(call, 1, nals); - mcall_set_ref_genotypes(call,nals); + mcall_set_ref_genotypes(call,nals_ori); bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now } + else if ( !is_variant ) + { + mcall_set_ref_genotypes(call,nals_ori); // running with -A, prevent mcall_call_genotypes from putting some ALT back + mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new); + } else { // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes. // Note that it is a valid outcome if the called genotypes exclude some of the ALTs. 
- init_allele_trimming_maps(call, out_als, nals); - if ( !is_variant ) - mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back - else if ( call->flag & CALL_CONSTR_TRIO ) + int ngts_new = call->nals_new*(call->nals_new+1)/2; + hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs); + for (i=0; inals_new; i++) call->ac[i] = 0; + + if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 ) + { + fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); + return 0; + } + if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) { - if ( nout>4 ) - { - fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - mcall_call_trio_genotypes(call, rec, nals,nout,out_als); + memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs)); + memset(call->GQs,0,nsmpl*sizeof(*call->GQs)); + } + for (i=0; insmpl_grp; i++) + { + if ( call->flag & CALL_CONSTR_TRIO ) + error("todo: constrained trio calling temporarily disabled\n"); //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]); + else + mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]); } - else - mcall_call_genotypes(call,rec,nals,nout,out_als); // Skip the site if all samples are 0/0. This can happen occasionally. 
- nAC = 0; - for (i=1; iac[i]; + for (i=1; inals_new; i++) nAC += call->ac[i]; if ( !nAC && call->flag & CALL_VARONLY ) return 0; - mcall_trim_PLs(call, rec, nals, nout, out_als); + + if ( call->output_tags & CALL_FMT_GP ) + bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new); + if ( call->output_tags & CALL_FMT_GQ ) + bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); + + mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new); } - if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als); + if ( nals_ori!=call->nals_new ) + mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new); - // Set QUAL and calculate HWE-related annotations + // Set QUAL if ( nAC ) { - float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid); - if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1); - - float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid); - if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1); - // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set - rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk)); + rec->qual = max_qual; } else { // Set the quality of a REF site - if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + if ( lk_sum!=-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk)); + else if ( call->ac[0] ) rec->qual = call->theta ? 
-4.343*call->theta : 0; else - rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk)); + bcf_float_set_missing(rec->qual); } - if ( rec->qual>999 ) rec->qual = 999; - if ( rec->qual>50 ) rec->qual = rint(rec->qual); - // AC, AN - if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1); + if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1); nAC += call->ac[0]; bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1); // Remove unused alleles - hts_expand(char*,nout,call->nals,call->als); - for (i=0; inals_new,call->nals,call->als); + for (i=0; ials_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i]; - bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout); + bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new); bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2); - // DP4 tag + // DP4 and PV4 tags if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 ) { int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3]; @@ -1712,10 +1666,22 @@ int mcall(call_t *call, bcf1_t *rec) int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]); bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1); + + if ( call->output_tags & CALL_FMT_PV4 ) + { + anno16_t a; + float tmpf[4]; + int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0; + if ( is_tested ) + { + for (i=0; i<4; i++) tmpf[i] = a.p[i]; + bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4); + } + } } bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag - return nout; + return call->nals_new; } diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index c621b4c..1f40eff 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. 
Previously bam_plcmd.c from samtools - Copyright (C) 2008-2018 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include "regidx.h" #include "bcftools.h" @@ -59,16 +60,19 @@ DEALINGS IN THE SOFTWARE. */ #define MPLP_PRINT_MAPQ (1<<10) #define MPLP_PER_SAMPLE (1<<11) #define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_REALN_PARTIAL (1<<13) typedef struct _mplp_aux_t mplp_aux_t; typedef struct _mplp_pileup_t mplp_pileup_t; // Data shared by all bam files typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, + max_indel_depth, max_read_len, fmt_flag, ambig_reads; int rflag_require, rflag_filter, output_type; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels + double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads; faidx_t *fai; @@ -231,7 +235,46 @@ static int mplp_func(void *data, bam1_t *b) has_ref = 0; } - if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + // Allow sufficient room for bam_aux_append of ZQ tag without + // a realloc and consequent breakage of pileup's cached pointers. + if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) { + // Doing sam_prob_realn later is problematic as it adds to + // the tag list (ZQ or BQ), which causes a realloc of b->data. + // This happens after pileup has built a hash table on the + // read name. It's a deficiency in pileup IMO. + + // We could implement a new sam_prob_realn that returns ZQ + // somewhere else and cache it ourselves (pileup clientdata), + // but for now we simply use a workaround. 
+ // + // We create a fake tag of the correct length, which we remove + // just prior calling sam_prob_realn so we can guarantee there is + // room. (We can't just make room now as bam_copy1 removes it + // again). + if (b->core.l_qseq > 500) { + uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1); + memset(ZQ, '@', b->core.l_qseq); + ZQ[b->core.l_qseq] = 0; + bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ); + free(ZQ); + } else { + static uint8_t ZQ[501] = + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"; + ZQ[b->core.l_qseq] = 0; + bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ); + ZQ[b->core.l_qseq] = '@'; + } + } + if (has_ref && ma->conf->capQ_thres > 10) { int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) continue; // skip @@ -257,18 +300,46 @@ static int mplp_func(void *data, bam1_t *b) static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { mplp_aux_t *ma = (mplp_aux_t *)data; - cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; - if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) - { - int i; - for (i=0; icore.n_cigar; i++) - { - int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; - if ( cig!=BAM_CSOFT_CLIP ) continue; - cd->i |= 1; + int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); + cd->i = 0; + PLP_SET_SAMPLE_ID(cd->i, n); + // Whether read has a soft-clip is used in mplp_realn's heuristics. + // TODO: consider whether clip length is beneficial to use? 
+ int i; + for (i=0; icore.n_cigar; i++) { + int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; + if (cig == BAM_CSOFT_CLIP) { + PLP_SET_SOFT_CLIP(cd->i); break; } } + + if (ma->conf->flag & MPLP_REALN) { + int i, tot_ins = 0; + uint32_t *cigar = bam_get_cigar(b); + int p = 0; + for (i=0; icore.n_cigar; i++) { + int cig = cigar[i] & BAM_CIGAR_MASK; + if (bam_cigar_type(cig) & 2) + p += cigar[i] >> BAM_CIGAR_SHIFT; + if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) { + tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; + // Possible further optimsation, check tot_ins==1 later + // (and remove break) so we can detect single bp indels. + // We may want to focus BAQ on more complex regions only. + PLP_SET_INDEL(cd->i); + break; + } + + // TODO: proper p->cd struct and have cd->i as a size rather + // than a flag. + + // Then aggregate together the sizes and if just 1 size for all + // reads or 2 sizes for approx 50/50 split in all reads, then + // treat this as a well-aligned variant and don't run BAQ. + } + } + return 0; } @@ -282,7 +353,7 @@ static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, c { const bam_pileup1_t *p = plp[i] + j; int id = PLP_SAMPLE_ID(p->cd.i); - if (m->n_plp[id] == m->m_plp[id]) + if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); @@ -317,6 +388,150 @@ static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bc if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); } +/* + * Loops for an indel at this position. + * + * Only reads that overlap an indel loci get realigned. This considerably + * reduces the cost of running BAQ while keeping the main benefits. 
+ * + * TODO: also consider only realigning reads that don't span the indel + * by more than a certain amount either-side. Ie focus BAQ only on reads + * ending adjacent to the indel, where the alignment is most likely to + * be wrong. (2nd TODO: do this based on sequence context; STRs bad, unique + * data good.) + * + * NB: this may sadly realign after we've already used the data. Hmm... + */ +static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, + int flag, int max_read_len, + char *ref, int ref_len, int pos) { + int i, j, has_indel = 0, has_clip = 0, nt = 0; + int min_indel = INT_MAX, max_indel = INT_MIN; + + // Is an indel present. + // NB: don't bother even checking if very long as almost guaranteed + // to have indel (and likely soft-clips too). + for (i = 0; i < n; i++) { // iterate over bams + nt += n_plp[i]; + for (j = 0; j < n_plp[i]; j++) { // iterate over reads + bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j; + has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0; + // Has_clip is almost always true for very long reads + // (eg PacBio CCS), but these rarely matter as the clip + // is likely a long way from this indel. + has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0; + if (max_indel < p->indel) + max_indel = p->indel; + if (min_indel > p->indel) + min_indel = p->indel; + } + } + + if (flag & MPLP_REALN_PARTIAL) { + if (has_indel == 0 || + (has_clip < 0.2*nt && max_indel == min_indel && + (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1))) + return; + } + + // Realign + for (i = 0; i < n; i++) { // iterate over bams + for (j = 0; j < n_plp[i]; j++) { // iterate over reads + const bam_pileup1_t *p = plp[i] + j; + bam1_t *b = p->b; + + // Avoid doing multiple times. + // + // Note we cannot modify p->cd.i here with a PLP_SET macro + // because the cd item is held by mpileup in an lbnode_t + // struct and copied over to the pileup struct for each + // iteration, essentially making p->cd.i read only. 
+ // + // We could use our own structure (p->cd.p), allocated during + // the constructor, but for simplicity we play dirty and + // abuse an unused flag bit instead. + if (b->core.flag & 32768) + continue; + b->core.flag |= 32768; + + if (b->core.l_qseq > max_read_len) + continue; + + // Check p->cigar_ind and see what cigar elements are before + // and after. How close is this location to the end of the + // read? Only realign if we don't span by more than X bases. + // + // Again, best only done on deeper data as BAQ helps + // disproportionately more on shallow data sets. + // + // This rescues some of the false negatives that are caused by + // systematic reduction in quality due to sample vs ref alignment. + +// At deep coverage we skip realigning more reads as we have sufficient depth. +// This rescues for false negatives. At shallow depth we pay for this with +// more FP so are more stringent on spanning size. +#define REALN_DIST (40+10*(nt<40)+10*(nt<20)) + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar; + + // Don't realign reads where indel is in middle? + // On long read data we don't care about soft-clips at the ends. + // For short read data, we always calc BAQ on these as they're + // a common source of false positives. + if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) { + // Left & right cigar op match. 
+ int lr = b->core.l_qseq > 500; + int lm = 0, rm = 0, k; + for (k = 0; k < ncig; k++) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + lm += bam_cigar_oplen(cig[k]); + else + break; + } + + for (k = ncig-1; k >= 0; k--) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + rm += bam_cigar_oplen(cig[k]); + else + break; + } + + if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) + continue; + + if (lm >= REALN_DIST && rm >= REALN_DIST && + has_clip < (0.15+0.05*(nt>20))*nt) + continue; + } + + if (b->core.l_qseq > 500) { + // don't do BAQ on long-read data if it's going to + // cause us to have a large band-with and costly in CPU + int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000) + continue; + } + + // Fudge: make room for ZQ tag. + uint8_t *_Q = bam_aux_get(b, "_Q"); + if (_Q) bam_aux_del(b, _Q); + sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3); + } + } + + return; +} + static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) { bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list @@ -324,7 +539,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) int ret, i, tid, pos, ref_len; char *ref; - while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) + while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) { if ( posend ) continue; if ( conf->bed && tid >= 0 ) @@ -333,7 +548,10 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) if ( !conf->bed_logic ) overlap = overlap ? 
0 : 1; if ( !overlap ) continue; } - mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + if (has_ref && (conf->flag & MPLP_REALN)) + mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, + conf->max_read_len, ref, ref_len, pos); int total_depth, _ref0, ref16; for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; @@ -346,18 +564,19 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) conf->bc.tid = tid; conf->bc.pos = pos; bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, + conf->bca, 0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth - && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0) + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth + && (bcf_callaux_clean(conf->bca, &conf->bc), + bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) { - bcf_callaux_clean(conf->bca, &conf->bc); for (i = 0; i < conf->gplp->n; ++i) bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); - if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) { bcf_clear1(conf->bcf_rec); bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); @@ -461,7 +680,7 @@ static int mpileup(mplp_conf_t *conf) conf->buf.l = 0; 
ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); - if ( !conf->mplp_data[i]->iter ) + if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { @@ -487,15 +706,19 @@ static int mpileup(mplp_conf_t *conf) conf->mplp_data[i]->h = hdr; } } + if ( !hdr ) { + fprintf(stderr, "[%s] failed to find a file header with usable read groups\n", __func__); + exit(EXIT_FAILURE); + } // allocate data storage proportionate to number of samples being studied sm->n bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); - conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); + conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header - conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); + conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname)); if (conf->bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); @@ -542,11 +765,24 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_VDB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + + if (conf->fmt_flag & B2B_INFO_ZSCORE) { + if ( conf->fmt_flag&B2B_INFO_RPB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_SCB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + } else { + if ( conf->fmt_flag&B2B_INFO_RPB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + } + + bcf_hdr_append(conf->bcf_hdr,"##INFO="); #if CDF_MWU_TESTS bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); @@ -576,6 +812,8 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_QS ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_ADF ) @@ -595,17 +833,23 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); - conf->bca = bcf_call_init(-1., conf->min_baseQ); + conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, + conf->delta_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, 
sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; + conf->bca->indel_bias = conf->indel_bias; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bca->fmt_flag = conf->fmt_flag; + conf->bca->ambig_reads = conf->ambig_reads; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); + conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES); + for (i=0; ibcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES; if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); @@ -643,7 +887,7 @@ static int mpileup(mplp_conf_t *conf) if ( nregs ) { int ireg = 0; - do + do { // first region is already positioned if ( ireg++ > 0 ) @@ -651,11 +895,11 @@ static int mpileup(mplp_conf_t *conf) conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); - for (i=0; infiles; i++) + for (i=0; infiles; i++) { hts_itr_destroy(conf->mplp_data[i]->iter); conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); - if ( !conf->mplp_data[i]->iter ) + if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { @@ -690,6 +934,7 @@ static int mpileup(mplp_conf_t *conf) free(conf->bc.ADR); free(conf->bc.ADF); free(conf->bc.SCR); + free(conf->bc.QS); free(conf->bc.fmt_arr); free(conf->bcr); } @@ -793,10 +1038,12 @@ int parse_format_flag(const char *str) else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; else if ( 
!strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; + else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS; else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; + else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB; else { fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str); @@ -821,6 +1068,7 @@ static void list_annotations(FILE *fp) " FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n" " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" +" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" " FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" "\n" @@ -843,78 +1091,98 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) // source code in 80 columns, to the extent that's possible.) 
fprintf(fp, -"\n" -"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" -"\n" -"Input options:\n" -" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" -" -A, --count-orphans do not discard anomalous read pairs\n" -" -b, --bam-list FILE list of input BAM filenames, one per line\n" -" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" -" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" -" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + "\n" + "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" + "\n" + "Input options:\n" + " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" + " -A, --count-orphans do not discard anomalous read pairs\n" + " -b, --bam-list FILE list of input BAM filenames, one per line\n" + " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" + " -C, --adjust-MQ INT adjust mapping quality [0]\n" + " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" + " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + fprintf(fp, + " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" + " -f, --fasta-ref FILE faidx indexed reference sequence file\n" + " --no-reference do not require fasta reference file\n" + " -G, --read-groups FILE select or exclude read groups listed in the file\n" + " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); fprintf(fp, -" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" -" -f, --fasta-ref FILE faidx indexed reference sequence file\n" -" --no-reference do not require fasta reference file\n" -" -G, --read-groups FILE select or exclude read groups listed in the file\n" -" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); fprintf(fp, -" -Q, 
--min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); fprintf(fp, -" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" -" -R, --regions-file FILE restrict to regions listed in a file\n" -" --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" -" [%s]\n", tmp_filter); + " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" + " -R, --regions-file FILE restrict to regions listed in a file\n" + " --ignore-RG ignore RG tags (one BAM = one sample)\n" + " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); fprintf(fp, -" -s, --samples LIST comma separated list of samples to include\n" -" -S, --samples-file FILE file of samples to include\n" -" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" -" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -" -x, --ignore-overlaps disable read-pair overlap detection\n" -"\n" -"Output options:\n" -" -a, --annotate LIST optional tags to output; '?' to list []\n" -" -g, --gvcf INT[,...] 
group non-variant sites into gVCF blocks according\n" -" to minimum per-sample DP\n" -" --no-version do not append version and command line to the header\n" -" -o, --output FILE write output to FILE [standard output]\n" -" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" -" 'z' compressed VCF; 'v' uncompressed VCF [v]\n" -" --threads INT use multithreading with INT worker threads [0]\n" -"\n" -"SNP/INDEL genotype likelihoods options:\n" -" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" + " [%s]\n", tmp_filter); fprintf(fp, -" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); + " -s, --samples LIST comma separated list of samples to include\n" + " -S, --samples-file FILE file of samples to include\n" + " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " -x, --ignore-overlaps disable read-pair overlap detection\n" + " --seed INT random number seed used for sampling deep regions [0]\n" + "\n" + "Output options:\n" + " -a, --annotate LIST optional tags to output; '?' to list available tags []\n" + " -g, --gvcf INT[,...] 
group non-variant sites into gVCF blocks according\n" + " to minimum per-sample DP\n" + " --no-version do not append version and command line to the header\n" + " -o, --output FILE write output to FILE [standard output]\n" + " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" + " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" + " -U, --mwu-u use older probability scale for Mann-Whitney U test\n" + " --threads INT use multithreading with INT worker threads [0]\n" + "\n" + "SNP/INDEL genotype likelihoods options:\n" + " -X, --config STR Specify platform specific profiles (see below)\n" + " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); fprintf(fp, -" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); fprintf(fp, -" -I, --skip-indels do not perform indel calling\n" -" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); fprintf(fp, -" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + " -I, --skip-indels do not perform indel calling\n" + " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); fprintf(fp, -" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, -" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" -" -P, --platforms STR comma separated list of platforms for indels [all]\n" -"\n" -"Notes: Assuming diploid individuals.\n" -"\n" -"Example:\n" -" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" -" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o 
calls.bcf\n" -"\n"); + " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + fprintf(fp, + " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + fprintf(fp, + " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" + " -P, --platforms STR comma separated list of platforms for indels [all]\n" + " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); + fprintf(fp, + " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + fprintf(fp,"\n"); + fprintf(fp, + "Configuration profiles activated with -X, --config:\n" + " 1.12: -Q13 -h100 -m1 -F0.002\n" + " illumina: [ default values ]\n" + " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" + " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" + "\n" + "Notes: Assuming diploid individuals.\n" + "\n" + "Example:\n" + " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" + " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" + "\n"); free(tmp_require); free(tmp_filter); } -int bam_mpileup(int argc, char *argv[]) +int main_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; @@ -922,12 +1190,15 @@ int bam_mpileup(int argc, char *argv[]) int nfiles = 0, use_orphan = 0, noref = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 13; + mplp.min_baseQ = 1; + mplp.max_baseQ = 60; + mplp.delta_baseQ = 30; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; - mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; - mplp.min_frac = 0.002; mplp.min_support = 1; - mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; + mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | 
MPLP_REALN_PARTIAL + | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; @@ -935,7 +1206,11 @@ int bam_mpileup(int argc, char *argv[]) mplp.record_cmd_line = 1; mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); - mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() + // the default to be changed in future, see also parse_format_flag() + mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + mplp.max_read_len = 500; + mplp.ambig_reads = B2B_DROP; + hts_srand48(0); static const struct option lopts[] = { @@ -956,6 +1231,8 @@ int bam_mpileup(int argc, char *argv[]) {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, + {"full-BAQ", no_argument, NULL, 'D'}, + {"full-baq", no_argument, NULL, 'D'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, @@ -972,6 +1249,9 @@ int bam_mpileup(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, + {"max-bq", required_argument, NULL, 11}, + {"max-BQ", required_argument, NULL, 11}, + {"delta-BQ", required_argument, NULL, 12}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"output-type", required_argument, NULL, 'O'}, {"samples", required_argument, NULL, 's'}, @@ -979,16 +1259,23 @@ int bam_mpileup(int argc, char *argv[]) {"annotate", required_argument, NULL, 'a'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, + {"indel-bias", required_argument, NULL, 10}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, - {"min-ireads ", required_argument, NULL, 'm'}, + {"min-ireads", required_argument, 
NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, + {"max-read-len", required_argument, NULL, 'M'}, + {"config", required_argument, NULL, 'X'}, + {"mwu-u", no_argument, NULL, 'U'}, + {"seed", required_argument, NULL, 13}, + {"ambig-reads", required_argument, NULL, 14}, + {"ar", required_argument, NULL, 14}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1040,23 +1327,26 @@ int bam_mpileup(int argc, char *argv[]) case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'B': mplp.flag &= ~MPLP_REALN; break; + case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break; case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break; - case 'O': + case 'O': switch (optarg[0]) { case 'b': mplp.output_type = FT_BCF_GZ; break; case 'u': mplp.output_type = FT_BCF; break; case 'z': mplp.output_type = FT_VCF_GZ; break; case 'v': mplp.output_type = FT_VCF; break; - default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); + default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. 
Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); } break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 11: mplp.max_baseQ = atoi(optarg); break; + case 12: mplp.delta_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; @@ -1068,6 +1358,12 @@ int bam_mpileup(int argc, char *argv[]) break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; + case 10: // --indel-bias (inverted so higher => more indels called) + if (atof(optarg) < 1e-2) + mplp.indel_bias = 1/1e2; + else + mplp.indel_bias = 1/atof(optarg); + break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1080,6 +1376,49 @@ int bam_mpileup(int argc, char *argv[]) } mplp.fmt_flag |= parse_format_flag(optarg); break; + case 'M': mplp.max_read_len = atoi(optarg); break; + case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break; + case 'X': + if (strcasecmp(optarg, "pacbio-ccs") == 0) { + mplp.min_frac = 0.1; + mplp.min_baseQ = 5; + mplp.max_baseQ = 50; + mplp.delta_baseQ = 10; + mplp.openQ = 25; + mplp.extQ = 1; + mplp.flag |= MPLP_REALN_PARTIAL; + mplp.max_read_len = 99999; + } else if (strcasecmp(optarg, "ont") == 0) { + fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " + "a higher -P, eg -P0.01 or -P 0.1\n"); + mplp.min_baseQ = 5; + mplp.max_baseQ = 30; + mplp.flag &= ~MPLP_REALN; + mplp.flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "1.12") == 0) { + // 1.12 and earlier + mplp.min_frac = 0.002; + mplp.min_support = 1; + mplp.min_baseQ = 13; + mplp.tandemQ = 100; + mplp.flag &= ~MPLP_REALN_PARTIAL; + mplp.flag |= MPLP_REALN; + } else if (strcasecmp(optarg, "illumina") == 0) { + mplp.flag |= MPLP_REALN_PARTIAL; + } else { + fprintf(stderr, "Unknown configuration name '%s'\n" + 
"Please choose from 1.12, illumina, pacbio-ccs or ont\n", + optarg); + return 1; + } + break; + case 13: hts_srand48(atoi(optarg)); break; + case 14: + if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP; + else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD; + else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0; + else error("The option to --ambig-reads not recognised: %s\n",optarg); + break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1; @@ -1120,7 +1459,7 @@ int bam_mpileup(int argc, char *argv[]) return 1; } int ret,i; - if (file_list) + if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mplp.files = fn; @@ -1142,5 +1481,6 @@ int bam_mpileup(int argc, char *argv[]) if (mplp.bed_itr) regitr_destroy(mplp.bed_itr); if (mplp.reg) regidx_destroy(mplp.reg); bam_smpl_destroy(mplp.bsmpl); + return ret; } diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 51fcf8b..c66c752 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -2,7 +2,7 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2018 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -41,6 +41,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include "regidx.h" #include "bcftools.h" @@ -61,16 +62,19 @@ DEALINGS IN THE SOFTWARE. 
*/ #define MPLP_PRINT_MAPQ (1<<10) #define MPLP_PER_SAMPLE (1<<11) #define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_REALN_PARTIAL (1<<13) typedef struct _mplp_aux_t mplp_aux_t; typedef struct _mplp_pileup_t mplp_pileup_t; // Data shared by all bam files typedef struct { - int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, + max_indel_depth, max_read_len, fmt_flag, ambig_reads; int rflag_require, rflag_filter, output_type; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels + double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads; faidx_t *fai; @@ -233,7 +237,46 @@ static int mplp_func(void *data, bam1_t *b) has_ref = 0; } - if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + // Allow sufficient room for bam_aux_append of ZQ tag without + // a realloc and consequent breakage of pileup's cached pointers. + if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) { + // Doing sam_prob_realn later is problematic as it adds to + // the tag list (ZQ or BQ), which causes a realloc of b->data. + // This happens after pileup has built a hash table on the + // read name. It's a deficiency in pileup IMO. + + // We could implement a new sam_prob_realn that returns ZQ + // somewhere else and cache it ourselves (pileup clientdata), + // but for now we simply use a workaround. + // + // We create a fake tag of the correct length, which we remove + // just prior calling sam_prob_realn so we can guarantee there is + // room. (We can't just make room now as bam_copy1 removes it + // again). 
+ if (b->core.l_qseq > 500) { + uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1); + memset(ZQ, '@', b->core.l_qseq); + ZQ[b->core.l_qseq] = 0; + bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ); + free(ZQ); + } else { + static uint8_t ZQ[501] = + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"; + ZQ[b->core.l_qseq] = 0; + bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ); + ZQ[b->core.l_qseq] = '@'; + } + } + if (has_ref && ma->conf->capQ_thres > 10) { int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) continue; // skip @@ -259,18 +302,46 @@ static int mplp_func(void *data, bam1_t *b) static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { mplp_aux_t *ma = (mplp_aux_t *)data; - cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; - if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) - { - int i; - for (i=0; icore.n_cigar; i++) - { - int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; - if ( cig!=BAM_CSOFT_CLIP ) continue; - cd->i |= 1; + int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); + cd->i = 0; + PLP_SET_SAMPLE_ID(cd->i, n); + // Whether read has a soft-clip is used in mplp_realn's heuristics. + // TODO: consider whether clip length is beneficial to use? 
+ int i; + for (i=0; icore.n_cigar; i++) { + int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; + if (cig == BAM_CSOFT_CLIP) { + PLP_SET_SOFT_CLIP(cd->i); break; } } + + if (ma->conf->flag & MPLP_REALN) { + int i, tot_ins = 0; + uint32_t *cigar = bam_get_cigar(b); + int p = 0; + for (i=0; icore.n_cigar; i++) { + int cig = cigar[i] & BAM_CIGAR_MASK; + if (bam_cigar_type(cig) & 2) + p += cigar[i] >> BAM_CIGAR_SHIFT; + if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) { + tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; + // Possible further optimsation, check tot_ins==1 later + // (and remove break) so we can detect single bp indels. + // We may want to focus BAQ on more complex regions only. + PLP_SET_INDEL(cd->i); + break; + } + + // TODO: proper p->cd struct and have cd->i as a size rather + // than a flag. + + // Then aggregate together the sizes and if just 1 size for all + // reads or 2 sizes for approx 50/50 split in all reads, then + // treat this as a well-aligned variant and don't run BAQ. + } + } + return 0; } @@ -284,7 +355,7 @@ static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, c { const bam_pileup1_t *p = plp[i] + j; int id = PLP_SAMPLE_ID(p->cd.i); - if (m->n_plp[id] == m->m_plp[id]) + if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); @@ -319,6 +390,150 @@ static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bc if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); } +/* + * Loops for an indel at this position. + * + * Only reads that overlap an indel loci get realigned. This considerably + * reduces the cost of running BAQ while keeping the main benefits. 
+ * + * TODO: also consider only realigning reads that don't span the indel + * by more than a certain amount either-side. Ie focus BAQ only on reads + * ending adjacent to the indel, where the alignment is most likely to + * be wrong. (2nd TODO: do this based on sequence context; STRs bad, unique + * data good.) + * + * NB: this may sadly realign after we've already used the data. Hmm... + */ +static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, + int flag, int max_read_len, + char *ref, int ref_len, int pos) { + int i, j, has_indel = 0, has_clip = 0, nt = 0; + int min_indel = INT_MAX, max_indel = INT_MIN; + + // Is an indel present. + // NB: don't bother even checking if very long as almost guaranteed + // to have indel (and likely soft-clips too). + for (i = 0; i < n; i++) { // iterate over bams + nt += n_plp[i]; + for (j = 0; j < n_plp[i]; j++) { // iterate over reads + bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j; + has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0; + // Has_clip is almost always true for very long reads + // (eg PacBio CCS), but these rarely matter as the clip + // is likely a long way from this indel. + has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0; + if (max_indel < p->indel) + max_indel = p->indel; + if (min_indel > p->indel) + min_indel = p->indel; + } + } + + if (flag & MPLP_REALN_PARTIAL) { + if (has_indel == 0 || + (has_clip < 0.2*nt && max_indel == min_indel && + (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1))) + return; + } + + // Realign + for (i = 0; i < n; i++) { // iterate over bams + for (j = 0; j < n_plp[i]; j++) { // iterate over reads + const bam_pileup1_t *p = plp[i] + j; + bam1_t *b = p->b; + + // Avoid doing multiple times. + // + // Note we cannot modify p->cd.i here with a PLP_SET macro + // because the cd item is held by mpileup in an lbnode_t + // struct and copied over to the pileup struct for each + // iteration, essentially making p->cd.i read only. 
+ // + // We could use our own structure (p->cd.p), allocated during + // the constructor, but for simplicity we play dirty and + // abuse an unused flag bit instead. + if (b->core.flag & 32768) + continue; + b->core.flag |= 32768; + + if (b->core.l_qseq > max_read_len) + continue; + + // Check p->cigar_ind and see what cigar elements are before + // and after. How close is this location to the end of the + // read? Only realign if we don't span by more than X bases. + // + // Again, best only done on deeper data as BAQ helps + // disproportionately more on shallow data sets. + // + // This rescues some of the false negatives that are caused by + // systematic reduction in quality due to sample vs ref alignment. + +// At deep coverage we skip realigning more reads as we have sufficient depth. +// This rescues for false negatives. At shallow depth we pay for this with +// more FP so are more stringent on spanning size. +#define REALN_DIST (40+10*(nt<40)+10*(nt<20)) + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar; + + // Don't realign reads where indel is in middle? + // On long read data we don't care about soft-clips at the ends. + // For short read data, we always calc BAQ on these as they're + // a common source of false positives. + if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) { + // Left & right cigar op match. 
+ int lr = b->core.l_qseq > 500; + int lm = 0, rm = 0, k; + for (k = 0; k < ncig; k++) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + lm += bam_cigar_oplen(cig[k]); + else + break; + } + + for (k = ncig-1; k >= 0; k--) { + int cop = bam_cigar_op(cig[k]); + if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP)) + continue; + + if (cop == BAM_CMATCH || cop == BAM_CDIFF || + cop == BAM_CEQUAL) + rm += bam_cigar_oplen(cig[k]); + else + break; + } + + if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4) + continue; + + if (lm >= REALN_DIST && rm >= REALN_DIST && + has_clip < (0.15+0.05*(nt>20))*nt) + continue; + } + + if (b->core.l_qseq > 500) { + // don't do BAQ on long-read data if it's going to + // cause us to have a large band-with and costly in CPU + int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000) + continue; + } + + // Fudge: make room for ZQ tag. + uint8_t *_Q = bam_aux_get(b, "_Q"); + if (_Q) bam_aux_del(b, _Q); + sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3); + } + } + + return; +} + static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) { bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list @@ -326,7 +541,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) int ret, i, tid, pos, ref_len; char *ref; - while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) + while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) { if ( posend ) continue; if ( conf->bed && tid >= 0 ) @@ -335,7 +550,10 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) if ( !conf->bed_logic ) overlap = overlap ? 
0 : 1; if ( !overlap ) continue; } - mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); + if (has_ref && (conf->flag & MPLP_REALN)) + mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, + conf->max_read_len, ref, ref_len, pos); int total_depth, _ref0, ref16; for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; @@ -348,18 +566,19 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) conf->bc.tid = tid; conf->bc.pos = pos; bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, + conf->bca, 0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth - && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0) + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth + && (bcf_callaux_clean(conf->bca, &conf->bc), + bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) { - bcf_callaux_clean(conf->bca, &conf->bc); for (i = 0; i < conf->gplp->n; ++i) bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); - if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) { bcf_clear1(conf->bcf_rec); bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); @@ -374,7 +593,7 @@ static int mpileup(mplp_conf_t *conf) { if (conf->nfiles == 0) { 
fprintf(bcftools_stderr,"[%s] no input file/data given\n", __func__); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } mplp_ref_t mp_ref = MPLP_REF_INIT; @@ -395,7 +614,7 @@ static int mpileup(mplp_conf_t *conf) conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); if ( !conf->reg ) { fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } } else @@ -403,7 +622,7 @@ static int mpileup(mplp_conf_t *conf) conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } } nregs = regidx_nregs(conf->reg); @@ -422,23 +641,23 @@ static int mpileup(mplp_conf_t *conf) if ( !conf->mplp_data[i]->fp ) { fprintf(bcftools_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(bcftools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { fprintf(bcftools_stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } conf->mplp_data[i]->conf = conf; conf->mplp_data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); if ( !h_tmp ) { fprintf(bcftools_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } conf->mplp_data[i]->h = i ? 
hdr : h_tmp; // for j==0, "h" has not been set yet conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]); @@ -458,20 +677,20 @@ static int mpileup(mplp_conf_t *conf) hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); if (idx == NULL) { fprintf(bcftools_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); - if ( !conf->mplp_data[i]->iter ) + if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } if ( nregs==1 ) // no need to keep the index in memory hts_idx_destroy(idx); @@ -489,18 +708,22 @@ static int mpileup(mplp_conf_t *conf) conf->mplp_data[i]->h = hdr; } } + if ( !hdr ) { + fprintf(bcftools_stderr, "[%s] failed to find a file header with usable read groups\n", __func__); + bcftools_exit(EXIT_FAILURE); + } // allocate data storage proportionate to number of samples being studied sm->n bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); - conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); + conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); fprintf(bcftools_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write 
the VCF header - conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); + conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname)); if (conf->bcf_fp == NULL) { fprintf(bcftools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); @@ -544,11 +767,24 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_VDB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + + if (conf->fmt_flag & B2B_INFO_ZSCORE) { + if ( conf->fmt_flag&B2B_INFO_RPB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_SCB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + } else { + if ( conf->fmt_flag&B2B_INFO_RPB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + } + + bcf_hdr_append(conf->bcf_hdr,"##INFO="); #if CDF_MWU_TESTS bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); @@ -578,6 +814,8 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_QS ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_ADF 
) @@ -597,17 +835,23 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); - conf->bca = bcf_call_init(-1., conf->min_baseQ); + conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, + conf->delta_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; + conf->bca->indel_bias = conf->indel_bias; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bca->fmt_flag = conf->fmt_flag; + conf->bca->ambig_reads = conf->ambig_reads; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); + conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES); + for (i=0; ibcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES; if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); @@ -645,7 +889,7 @@ static int mpileup(mplp_conf_t *conf) if ( nregs ) { int ireg = 0; - do + do { // first region is already positioned if ( ireg++ > 0 ) @@ -653,19 +897,19 @@ static int mpileup(mplp_conf_t *conf) conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); - for (i=0; infiles; i++) + for (i=0; infiles; i++) { hts_itr_destroy(conf->mplp_data[i]->iter); conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); - if ( !conf->mplp_data[i]->iter ) + if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, 
conf->buf.s); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } bam_mplp_reset(conf->iter); } @@ -692,6 +936,7 @@ static int mpileup(mplp_conf_t *conf) free(conf->bc.ADR); free(conf->bc.ADF); free(conf->bc.SCR); + free(conf->bc.QS); free(conf->bc.fmt_arr); free(conf->bcr); } @@ -795,14 +1040,16 @@ int parse_format_flag(const char *str) else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; + else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS; else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; + else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB; else { fprintf(bcftools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } free(tags[i]); } @@ -823,6 +1070,7 @@ static void list_annotations(FILE *fp) " FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n" " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" +" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" " FORMAT/SP .. 
Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" " FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" "\n" @@ -845,78 +1093,98 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) // source code in 80 columns, to the extent that's possible.) fprintf(fp, -"\n" -"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" -"\n" -"Input options:\n" -" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" -" -A, --count-orphans do not discard anomalous read pairs\n" -" -b, --bam-list FILE list of input BAM filenames, one per line\n" -" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" -" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" -" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + "\n" + "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" + "\n" + "Input options:\n" + " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" + " -A, --count-orphans do not discard anomalous read pairs\n" + " -b, --bam-list FILE list of input BAM filenames, one per line\n" + " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" + " -C, --adjust-MQ INT adjust mapping quality [0]\n" + " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" + " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + fprintf(fp, + " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" + " -f, --fasta-ref FILE faidx indexed reference sequence file\n" + " --no-reference do not require fasta reference file\n" + " -G, --read-groups FILE select or exclude read groups listed in the file\n" + " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); fprintf(fp, -" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" -" -f, --fasta-ref FILE faidx indexed reference sequence file\n" -" --no-reference do not require fasta reference file\n" 
-" -G, --read-groups FILE select or exclude read groups listed in the file\n" -" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); fprintf(fp, -" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); fprintf(fp, -" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" -" -R, --regions-file FILE restrict to regions listed in a file\n" -" --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" -" [%s]\n", tmp_filter); + " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" + " -R, --regions-file FILE restrict to regions listed in a file\n" + " --ignore-RG ignore RG tags (one BAM = one sample)\n" + " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); fprintf(fp, -" -s, --samples LIST comma separated list of samples to include\n" -" -S, --samples-file FILE file of samples to include\n" -" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" -" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -" -x, --ignore-overlaps disable read-pair overlap detection\n" -"\n" -"Output options:\n" -" -a, --annotate LIST optional tags to output; '?' to list []\n" -" -g, --gvcf INT[,...] 
group non-variant sites into gVCF blocks according\n" -" to minimum per-sample DP\n" -" --no-version do not append version and command line to the header\n" -" -o, --output FILE write output to FILE [standard output]\n" -" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" -" 'z' compressed VCF; 'v' uncompressed VCF [v]\n" -" --threads INT use multithreading with INT worker threads [0]\n" -"\n" -"SNP/INDEL genotype likelihoods options:\n" -" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); + " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" + " [%s]\n", tmp_filter); fprintf(fp, -" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); + " -s, --samples LIST comma separated list of samples to include\n" + " -S, --samples-file FILE file of samples to include\n" + " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " -x, --ignore-overlaps disable read-pair overlap detection\n" + " --seed INT random number seed used for sampling deep regions [0]\n" + "\n" + "Output options:\n" + " -a, --annotate LIST optional tags to output; '?' to list available tags []\n" + " -g, --gvcf INT[,...] 
group non-variant sites into gVCF blocks according\n" + " to minimum per-sample DP\n" + " --no-version do not append version and command line to the header\n" + " -o, --output FILE write output to FILE [standard output]\n" + " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" + " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" + " -U, --mwu-u use older probability scale for Mann-Whitney U test\n" + " --threads INT use multithreading with INT worker threads [0]\n" + "\n" + "SNP/INDEL genotype likelihoods options:\n" + " -X, --config STR Specify platform specific profiles (see below)\n" + " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); fprintf(fp, -" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); fprintf(fp, -" -I, --skip-indels do not perform indel calling\n" -" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); fprintf(fp, -" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + " -I, --skip-indels do not perform indel calling\n" + " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); fprintf(fp, -" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, -" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" -" -P, --platforms STR comma separated list of platforms for indels [all]\n" -"\n" -"Notes: Assuming diploid individuals.\n" -"\n" -"Example:\n" -" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" -" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o 
calls.bcf\n" -"\n"); + " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + fprintf(fp, + " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); + fprintf(fp, + " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" + " -P, --platforms STR comma separated list of platforms for indels [all]\n" + " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); + fprintf(fp, + " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + fprintf(fp,"\n"); + fprintf(fp, + "Configuration profiles activated with -X, --config:\n" + " 1.12: -Q13 -h100 -m1 -F0.002\n" + " illumina: [ default values ]\n" + " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" + " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" + "\n" + "Notes: Assuming diploid individuals.\n" + "\n" + "Example:\n" + " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" + " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" + "\n"); free(tmp_require); free(tmp_filter); } -int bam_mpileup(int argc, char *argv[]) +int main_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; @@ -924,12 +1192,15 @@ int bam_mpileup(int argc, char *argv[]) int nfiles = 0, use_orphan = 0, noref = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 13; + mplp.min_baseQ = 1; + mplp.max_baseQ = 60; + mplp.delta_baseQ = 30; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; - mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; - mplp.min_frac = 0.002; mplp.min_support = 1; - mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; + mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | 
MPLP_REALN_PARTIAL + | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; @@ -937,7 +1208,11 @@ int bam_mpileup(int argc, char *argv[]) mplp.record_cmd_line = 1; mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); - mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() + // the default to be changed in future, see also parse_format_flag() + mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + mplp.max_read_len = 500; + mplp.ambig_reads = B2B_DROP; + hts_srand48(0); static const struct option lopts[] = { @@ -958,6 +1233,8 @@ int bam_mpileup(int argc, char *argv[]) {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, + {"full-BAQ", no_argument, NULL, 'D'}, + {"full-baq", no_argument, NULL, 'D'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, @@ -974,6 +1251,9 @@ int bam_mpileup(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, + {"max-bq", required_argument, NULL, 11}, + {"max-BQ", required_argument, NULL, 11}, + {"delta-BQ", required_argument, NULL, 12}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"output-type", required_argument, NULL, 'O'}, {"samples", required_argument, NULL, 's'}, @@ -981,16 +1261,23 @@ int bam_mpileup(int argc, char *argv[]) {"annotate", required_argument, NULL, 'a'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, + {"indel-bias", required_argument, NULL, 10}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, - {"min-ireads ", required_argument, NULL, 'm'}, + {"min-ireads", required_argument, 
NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, + {"max-read-len", required_argument, NULL, 'M'}, + {"config", required_argument, NULL, 'X'}, + {"mwu-u", no_argument, NULL, 'U'}, + {"seed", required_argument, NULL, 13}, + {"ambig-reads", required_argument, NULL, 14}, + {"ar", required_argument, NULL, 14}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1030,7 +1317,7 @@ int bam_mpileup(int argc, char *argv[]) if ( regidx_insert_list(mplp.bed,optarg,',') !=0 ) { fprintf(bcftools_stderr,"Could not parse the targets: %s\n", optarg); - exit(EXIT_FAILURE); + bcftools_exit(EXIT_FAILURE); } break; case 'T': @@ -1042,23 +1329,26 @@ int bam_mpileup(int argc, char *argv[]) case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'B': mplp.flag &= ~MPLP_REALN; break; + case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break; case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break; - case 'O': + case 'O': switch (optarg[0]) { case 'b': mplp.output_type = FT_BCF_GZ; break; case 'u': mplp.output_type = FT_BCF; break; case 'z': mplp.output_type = FT_VCF_GZ; break; case 'v': mplp.output_type = FT_VCF; break; - default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. 
Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); + default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); } break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 11: mplp.max_baseQ = atoi(optarg); break; + case 12: mplp.delta_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; @@ -1070,6 +1360,12 @@ int bam_mpileup(int argc, char *argv[]) break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; + case 10: // --indel-bias (inverted so higher => more indels called) + if (atof(optarg) < 1e-2) + mplp.indel_bias = 1/1e2; + else + mplp.indel_bias = 1/atof(optarg); + break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1082,6 +1378,49 @@ int bam_mpileup(int argc, char *argv[]) } mplp.fmt_flag |= parse_format_flag(optarg); break; + case 'M': mplp.max_read_len = atoi(optarg); break; + case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break; + case 'X': + if (strcasecmp(optarg, "pacbio-ccs") == 0) { + mplp.min_frac = 0.1; + mplp.min_baseQ = 5; + mplp.max_baseQ = 50; + mplp.delta_baseQ = 10; + mplp.openQ = 25; + mplp.extQ = 1; + mplp.flag |= MPLP_REALN_PARTIAL; + mplp.max_read_len = 99999; + } else if (strcasecmp(optarg, "ont") == 0) { + fprintf(bcftools_stderr, "For ONT it may be beneficial to also run bcftools call with " + "a higher -P, eg -P0.01 or -P 0.1\n"); + mplp.min_baseQ = 5; + mplp.max_baseQ = 30; + mplp.flag &= ~MPLP_REALN; + mplp.flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "1.12") == 0) { + // 1.12 and earlier + mplp.min_frac = 0.002; + mplp.min_support = 1; + mplp.min_baseQ = 13; + mplp.tandemQ = 100; + mplp.flag &= 
~MPLP_REALN_PARTIAL; + mplp.flag |= MPLP_REALN; + } else if (strcasecmp(optarg, "illumina") == 0) { + mplp.flag |= MPLP_REALN_PARTIAL; + } else { + fprintf(bcftools_stderr, "Unknown configuration name '%s'\n" + "Please choose from 1.12, illumina, pacbio-ccs or ont\n", + optarg); + return 1; + } + break; + case 13: hts_srand48(atoi(optarg)); break; + case 14: + if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP; + else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD; + else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0; + else error("The option to --ambig-reads not recognised: %s\n",optarg); + break; default: fprintf(bcftools_stderr,"Invalid option: '%c'\n", c); return 1; @@ -1122,7 +1461,7 @@ int bam_mpileup(int argc, char *argv[]) return 1; } int ret,i; - if (file_list) + if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mplp.files = fn; @@ -1144,5 +1483,6 @@ int bam_mpileup(int argc, char *argv[]) if (mplp.bed_itr) regitr_destroy(mplp.bed_itr); if (mplp.reg) regidx_destroy(mplp.reg); bam_smpl_destroy(mplp.bsmpl); + return ret; } diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h index 1e7d2f7..7697c65 100644 --- a/bcftools/ploidy.h +++ b/bcftools/ploidy.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2015 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 954d43c..3ab7bcb 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -1,7 +1,7 @@ /* prob1.c -- mathematical utility functions. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012, 2013-2014, 2017 Genome Research Ltd. Author: Heng Li diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c index bd73e1d..6d2bbd1 100644 --- a/bcftools/prob1.c.pysam.c +++ b/bcftools/prob1.c.pysam.c @@ -3,7 +3,7 @@ /* prob1.c -- mathematical utility functions. Copyright (C) 2010, 2011 Broad Institute. 
- Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012, 2013-2014, 2017 Genome Research Ltd. Author: Heng Li diff --git a/bcftools/prob1.h b/bcftools/prob1.h index a3d4b0d..a562265 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -1,7 +1,7 @@ /* prob1.h -- mathematical utility functions. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012, 2013 Genome Research Ltd. + Copyright (C) 2012, 2013-2014 Genome Research Ltd. Author: Heng Li diff --git a/bcftools/rbuf.h b/bcftools/rbuf.h index 2c0e5b1..ef2e206 100644 --- a/bcftools/rbuf.h +++ b/bcftools/rbuf.h @@ -1,6 +1,6 @@ /* rbuf.h -- round buffers. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2014, 2017 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/regidx.c b/bcftools/regidx.c index 5c6c8ce..cdaf7ea 100644 --- a/bcftools/regidx.c +++ b/bcftools/regidx.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2014-2017 Genome Research Ltd. + Copyright (C) 2014-2018 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c index 684993c..4eb96e8 100644 --- a/bcftools/regidx.c.pysam.c +++ b/bcftools/regidx.c.pysam.c @@ -1,7 +1,7 @@ #include "bcftools.pysam.h" /* - Copyright (C) 2014-2017 Genome Research Ltd. + Copyright (C) 2014-2018 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/regidx.h b/bcftools/regidx.h index a654dbd..f13b52a 100644 --- a/bcftools/regidx.h +++ b/bcftools/regidx.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2016, 2018 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/reheader.c b/bcftools/reheader.c index 60a60e1..ae7c622 100644 --- a/bcftools/reheader.c +++ b/bcftools/reheader.c @@ -1,6 +1,6 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014-2018 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -49,7 +49,7 @@ THE SOFTWARE. 
*/ typedef struct _args_t { char **argv, *fname, *samples_fname, *header_fname, *output_fname; - char *fai_fname, *rm_tmpfile; + char *fai_fname, *rm_tmpfile, *tmp_prefix; htsFile *fp; htsFormat type; htsThreadPool *threads; @@ -140,6 +140,33 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see free(key.s); free(val.s); free(tmp.s); return q; } +char *init_tmp_prefix(const char *tmp_prefix) +{ + char *prefix = NULL; + if ( tmp_prefix ) + { + int len = strlen(tmp_prefix); + prefix = (char*) calloc(len+7,1); + memcpy(prefix,tmp_prefix,len); + memcpy(prefix+len,"XXXXXX",6); + } + else + { + #ifdef _WIN32 + char tmp_path[MAX_PATH]; + int ret = GetTempPath(MAX_PATH, tmp_path); + if (!ret || ret > MAX_PATH) + error("Could not get the path to the temporary folder\n"); + if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) + error("Full path to the temporary folder is too long\n"); + strcat(tmp_path, "/bcftools.XXXXXX"); + prefix = strdup(tmp_path); + #else + prefix = strdup("/tmp/bcftools.XXXXXX"); + #endif + } + return prefix; +} static void update_from_fai(args_t *args) { if ( !strcmp("-",args->fname) ) @@ -147,18 +174,7 @@ static void update_from_fai(args_t *args) faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA); if ( !fai ) error("Could not parse %s\n", args->fai_fname); -#ifdef _WIN32 - char tmp_path[MAX_PATH]; - int ret = GetTempPath(MAX_PATH, tmp_path); - if (!ret || ret > MAX_PATH) - error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools-fai-header-XXXXXX"); - args->rm_tmpfile = strdup(tmp_path); -#else - args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX"); -#endif + args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix); int fd = mkstemp(args->rm_tmpfile); if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", 
args->rm_tmpfile); @@ -273,8 +289,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id hdr->s[hdr->l] = 0; kstring_t tmp = {0,0,0}; - i = j = n = 0; - while ( hdr->s[idx+i] && hdr->s[idx+i]) + i = j = n = 0; // i:traverse the #CHROM line 1 by 1; j:points to the last column + while ( hdr->s[idx+i] ) { if ( hdr->s[idx+i]=='\t' ) { @@ -282,8 +298,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id if ( ++n>9 ) { - char *ori = khash_str2str_get(hash,hdr->s+idx+j); - kputs(ori ? ori : hdr->s+idx+j, &tmp); + char *new_name = khash_str2str_get(hash,hdr->s+idx+j); + kputs(new_name ? new_name : hdr->s+idx+j, &tmp); } else kputs(hdr->s+idx+j, &tmp); @@ -295,8 +311,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id } i++; } - char *ori = khash_str2str_get(hash,hdr->s+idx+j); - kputs(ori ? ori : hdr->s+idx+j, &tmp); + char *new_name = khash_str2str_get(hash,hdr->s+idx+j); + kputs(new_name ? new_name : hdr->s+idx+j, &tmp); khash_str2str_destroy_free_all(hash); @@ -317,7 +333,13 @@ static void set_samples(char **samples, int nsamples, kstring_t *hdr) if ( hdr->s[i]=='\t' ) ncols++; i--; } - if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s); + if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) + { + if ( i>0 && !strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",38) ) + error("Error: missing FORMAT fields, cowardly refusing to add samples\n"); + + error("Could not parse the header: %s\n", hdr->s); + } // Are the samples "old-sample new-sample" pairs? 
if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return; @@ -388,7 +410,10 @@ static void reheader_vcf_gz(args_t *args) int nsamples = 0; char **samples = NULL; if ( args->samples_fname ) + { samples = hts_readlines(args->samples_fname, &nsamples); + if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname); + } if ( args->header_fname ) { free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; @@ -444,7 +469,10 @@ static void reheader_vcf(args_t *args) int nsamples = 0; char **samples = NULL; if ( args->samples_fname ) + { samples = hts_readlines(args->samples_fname, &nsamples); + if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname); + } if ( args->header_fname ) { free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; @@ -548,7 +576,10 @@ static void reheader_bcf(args_t *args, int is_compressed) int i, nsamples = 0; char **samples = NULL; if ( args->samples_fname ) + { samples = hts_readlines(args->samples_fname, &nsamples); + if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname); + } if ( args->header_fname ) { free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; @@ -639,11 +670,16 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools reheader [OPTIONS] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -f, --fai update sequences and their lengths from the .fai file\n"); - fprintf(stderr, " -h, --header new header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -s, --samples new sample names\n"); - fprintf(stderr, " --threads use multithreading with worker threads (BCF only) [0]\n"); + fprintf(stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n"); + fprintf(stderr, " -h, --header FILE new header\n"); + fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(stderr, " -s, --samples FILE 
new sample names\n"); +#ifdef _WIN32 + fprintf(stderr, " -T, --temp-prefix PATH template for temporary file name [/bcftools.XXXXXX]\n"); +#else + fprintf(stderr, " -T, --temp-prefix PATH template for temporary file name [/tmp/bcftools.XXXXXX]\n"); +#endif + fprintf(stderr, " --threads INT use multithreading with worker threads (BCF only) [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Example:\n"); fprintf(stderr, " # Write out the header to be modified\n"); @@ -666,6 +702,7 @@ int main_reheader(int argc, char *argv[]) static struct option loptions[] = { + {"temp-prefix",1,0,'T'}, {"fai",1,0,'f'}, {"output",1,0,'o'}, {"header",1,0,'h'}, @@ -673,11 +710,12 @@ int main_reheader(int argc, char *argv[]) {"threads",1,NULL,1}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0) { switch (c) { case 1 : args->n_threads = strtol(optarg, 0, 0); break; + case 'T': args->tmp_prefix = optarg; break; case 'f': args->fai_fname = optarg; break; case 'o': args->output_fname = optarg; break; case 's': args->samples_fname = optarg; break; @@ -704,10 +742,14 @@ int main_reheader(int argc, char *argv[]) if ( args->type.format==vcf ) { - if ( args->type.compression==bgzf || args->type.compression==gzip ) + if ( args->type.compression==bgzf ) reheader_vcf_gz(args); - else + else if ( args->type.compression==no_compression ) reheader_vcf(args); + else if ( args->type.compression==gzip ) + error("Error: cannot reheader gzip-compressed files, first convert with `bcftools view --output-type` to a supported format\n"); + else + error("Error: the compression type of \"%s\" is not recognised/supported\n", args->fname); } else reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c index 9f84e4c..380843b 100644 --- a/bcftools/reheader.c.pysam.c +++ b/bcftools/reheader.c.pysam.c @@ -2,7 +2,7 @@ /* 
reheader.c -- reheader subcommand. - Copyright (C) 2014-2018 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -51,7 +51,7 @@ THE SOFTWARE. */ typedef struct _args_t { char **argv, *fname, *samples_fname, *header_fname, *output_fname; - char *fai_fname, *rm_tmpfile; + char *fai_fname, *rm_tmpfile, *tmp_prefix; htsFile *fp; htsFormat type; htsThreadPool *threads; @@ -142,6 +142,33 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see free(key.s); free(val.s); free(tmp.s); return q; } +char *init_tmp_prefix(const char *tmp_prefix) +{ + char *prefix = NULL; + if ( tmp_prefix ) + { + int len = strlen(tmp_prefix); + prefix = (char*) calloc(len+7,1); + memcpy(prefix,tmp_prefix,len); + memcpy(prefix+len,"XXXXXX",6); + } + else + { + #ifdef _WIN32 + char tmp_path[MAX_PATH]; + int ret = GetTempPath(MAX_PATH, tmp_path); + if (!ret || ret > MAX_PATH) + error("Could not get the path to the temporary folder\n"); + if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) + error("Full path to the temporary folder is too long\n"); + strcat(tmp_path, "/bcftools.XXXXXX"); + prefix = strdup(tmp_path); + #else + prefix = strdup("/tmp/bcftools.XXXXXX"); + #endif + } + return prefix; +} static void update_from_fai(args_t *args) { if ( !strcmp("-",args->fname) ) @@ -149,18 +176,7 @@ static void update_from_fai(args_t *args) faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA); if ( !fai ) error("Could not parse %s\n", args->fai_fname); -#ifdef _WIN32 - char tmp_path[MAX_PATH]; - int ret = GetTempPath(MAX_PATH, tmp_path); - if (!ret || ret > MAX_PATH) - error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools-fai-header-XXXXXX"); - args->rm_tmpfile = strdup(tmp_path); -#else - args->rm_tmpfile = 
strdup("/tmp/bcftools-fai-header-XXXXXX"); -#endif + args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix); int fd = mkstemp(args->rm_tmpfile); if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile); @@ -275,8 +291,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id hdr->s[hdr->l] = 0; kstring_t tmp = {0,0,0}; - i = j = n = 0; - while ( hdr->s[idx+i] && hdr->s[idx+i]) + i = j = n = 0; // i:traverse the #CHROM line 1 by 1; j:points to the last column + while ( hdr->s[idx+i] ) { if ( hdr->s[idx+i]=='\t' ) { @@ -284,8 +300,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id if ( ++n>9 ) { - char *ori = khash_str2str_get(hash,hdr->s+idx+j); - kputs(ori ? ori : hdr->s+idx+j, &tmp); + char *new_name = khash_str2str_get(hash,hdr->s+idx+j); + kputs(new_name ? new_name : hdr->s+idx+j, &tmp); } else kputs(hdr->s+idx+j, &tmp); @@ -297,8 +313,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id } i++; } - char *ori = khash_str2str_get(hash,hdr->s+idx+j); - kputs(ori ? ori : hdr->s+idx+j, &tmp); + char *new_name = khash_str2str_get(hash,hdr->s+idx+j); + kputs(new_name ? new_name : hdr->s+idx+j, &tmp); khash_str2str_destroy_free_all(hash); @@ -319,7 +335,13 @@ static void set_samples(char **samples, int nsamples, kstring_t *hdr) if ( hdr->s[i]=='\t' ) ncols++; i--; } - if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s); + if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) + { + if ( i>0 && !strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",38) ) + error("Error: missing FORMAT fields, cowardly refusing to add samples\n"); + + error("Could not parse the header: %s\n", hdr->s); + } // Are the samples "old-sample new-sample" pairs? 
if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return; @@ -390,7 +412,10 @@ static void reheader_vcf_gz(args_t *args) int nsamples = 0; char **samples = NULL; if ( args->samples_fname ) + { samples = hts_readlines(args->samples_fname, &nsamples); + if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname); + } if ( args->header_fname ) { free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; @@ -446,7 +471,10 @@ static void reheader_vcf(args_t *args) int nsamples = 0; char **samples = NULL; if ( args->samples_fname ) + { samples = hts_readlines(args->samples_fname, &nsamples); + if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname); + } if ( args->header_fname ) { free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; @@ -550,7 +578,10 @@ static void reheader_bcf(args_t *args, int is_compressed) int i, nsamples = 0; char **samples = NULL; if ( args->samples_fname ) + { samples = hts_readlines(args->samples_fname, &nsamples); + if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname); + } if ( args->header_fname ) { free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; @@ -641,11 +672,16 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -f, --fai update sequences and their lengths from the .fai file\n"); - fprintf(bcftools_stderr, " -h, --header new header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -s, --samples new sample names\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads (BCF only) [0]\n"); + fprintf(bcftools_stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n"); + fprintf(bcftools_stderr, " -h, --header FILE new header\n"); + fprintf(bcftools_stderr, " -o, 
--output FILE write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -s, --samples FILE new sample names\n"); +#ifdef _WIN32 + fprintf(bcftools_stderr, " -T, --temp-prefix PATH template for temporary file name [/bcftools.XXXXXX]\n"); +#else + fprintf(bcftools_stderr, " -T, --temp-prefix PATH template for temporary file name [/tmp/bcftools.XXXXXX]\n"); +#endif + fprintf(bcftools_stderr, " --threads INT use multithreading with worker threads (BCF only) [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Example:\n"); fprintf(bcftools_stderr, " # Write out the header to be modified\n"); @@ -657,7 +693,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " # Reheader the file\n"); fprintf(bcftools_stderr, " bcftools reheader -h header.txt -o new.bcf old.bcf\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_reheader(int argc, char *argv[]) @@ -668,6 +704,7 @@ int main_reheader(int argc, char *argv[]) static struct option loptions[] = { + {"temp-prefix",1,0,'T'}, {"fai",1,0,'f'}, {"output",1,0,'o'}, {"header",1,0,'h'}, @@ -675,11 +712,12 @@ int main_reheader(int argc, char *argv[]) {"threads",1,NULL,1}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0) { switch (c) { case 1 : args->n_threads = strtol(optarg, 0, 0); break; + case 'T': args->tmp_prefix = optarg; break; case 'f': args->fai_fname = optarg; break; case 'o': args->output_fname = optarg; break; case 's': args->samples_fname = optarg; break; @@ -706,10 +744,14 @@ int main_reheader(int argc, char *argv[]) if ( args->type.format==vcf ) { - if ( args->type.compression==bgzf || args->type.compression==gzip ) + if ( args->type.compression==bgzf ) reheader_vcf_gz(args); - else + else if ( args->type.compression==no_compression ) reheader_vcf(args); + else if ( args->type.compression==gzip ) + error("Error: cannot reheader gzip-compressed 
files, first convert with `bcftools view --output-type` to a supported format\n"); + else + error("Error: the compression type of \"%s\" is not recognised/supported\n", args->fname); } else reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c index 9a77e62..d170db5 100644 --- a/bcftools/smpl_ilist.c +++ b/bcftools/smpl_ilist.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016, 2018 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c index 45fe5af..85b5e2f 100644 --- a/bcftools/smpl_ilist.c.pysam.c +++ b/bcftools/smpl_ilist.c.pysam.c @@ -1,7 +1,7 @@ #include "bcftools.pysam.h" /* - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016, 2018 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/str_finder.c b/bcftools/str_finder.c new file mode 100644 index 0000000..800cbfe --- /dev/null +++ b/bcftools/str_finder.c @@ -0,0 +1,270 @@ +/* str_finder.c -- Short Tandem Repeat finder. + Originally from Crumble (https://github.com/jkbonfield/crumble) + + Copyright (C) 2015-2016, 2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include + +#include "str_finder.h" +#include "utlist.h" + +#define MAX(a,b) ((a)>(b)?(a):(b)) +#define MIN(a,b) ((a)<(b)?(a):(b)) + +typedef unsigned char uc; + +static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, + int lower_only, unsigned int w) { + rep_ele *el, *tmp, *prev; + char *cp1, *cp2, *cp_end; + int i; + + // Already handled this in previous overlap? + if (*list) { + tmp = DL_TAIL(*list); + if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos) + return; + } + + // Find current and last occurence of repeated word. + + cp2 = &cons[pos+1]; + // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen]; + + for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads + if (*cp1 == '*') + continue; + else + i++; + while (*cp1 == '*') + cp1--; + + + // Scan ahead to see how much further it goes. 
+ cp_end = &cons[clen]; + while (cp2 < cp_end) { + if (*cp1 != *cp2) + break; + + w<<=2; + w|=*cp2; + cp1++; + cp2++; + } + + if (!(el = malloc(sizeof(*el)))) + return; + + el->end = pos + cp2-&cons[pos+1]; + el->rep_len = rlen; + pos++; + while (rlen--) { + while (cons[--pos] == '*'); + while (cons[--pos] == '*'); + } + //pos++; + while (pos > 1 && cons[pos-1] == '*') pos--; + el->start = pos; + + // Check it meets the lower-case only criteria + if (lower_only) { + int lc = 0; + for (i = el->start; i <= el->end; i++) { + if (islower(cons[i])) { + lc = 1; + break; + } + } + + if (!lc) { + free(el); + return; + } + } + + // Remove any older items on the list that are entirely contained within el + if (*list) { + tmp = DL_TAIL(*list); + do { + prev = tmp->prev; + if (tmp->end < el->start) + break; + + if (tmp->start >= el->start) { + DL_DELETE(*list, tmp); + free(tmp); + } + + if (tmp == DL_HEAD(*list)) + break; + tmp = prev; + } while (*list); + } + + DL_APPEND(*list, el); + + return; +} + +/* + * Finds repeated homopolymers up to 8-mers. + * Note this assumes cons is 0-3, so N of 4 may rarely give false hits. + * + * Returns a list of rep_ele structs holding the start,end tuples of repeats; + * NULL on failure. 
+ */ +rep_ele *find_STR(char *cons, int len, int lower_only) { + int i, j; + uint32_t w = 0; + rep_ele *reps = NULL; + + for (i = j = 0; i < len && j < 15; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + + j++; + } + + for (; i < len; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if ((w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + else if ((w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + else if ((w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + else if ((w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + else if ((w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + else if ((w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + else if ((w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + } + + return reps; +} + +/* ----------------------------------------------------------------------------- + * Computes repeat regions in the consensus and then 
provides a bit mask + * indicating the extend of the STRs. + * + * The purpose of this is to identify where a read needs to span the entire + * region in order to validate how many copies of a repeat word are present. + * This only really has a major impact when indels are involved. + * + * For example, given this multiple alignment: + * + * S1 GATCGGACGAGAG + * S2 GATCGGACGAGAGAGAGAGAGT + * S3 GATCGGACGAGAGAGAGAG**TCGGAC + * S4 GGACGAGAGAGAGAGAGTCGGAC + * S5 CGAGAGAGAGAG**TCGGAC + * S6 AGAGAGAGTCGGAC + * + * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last + * (S1 and S6) sequences do not span and so we do not know which allele they + * match. Specifically as the pad is at the right hand end, the alignment of + * S6 gives incorrect weight to the consensus as it is stating AG when it + * may actually be ** at that point. + * + * By identifying the repeats we can soft clip as follows: + * + * S1 GATCGGACgagag + * S2 GATCGGACGAGAGAGAGAGAGT + * S3 GATCGGACGAGAGAGAGAG**TCGGAC + * S4 GGACGAGAGAGAGAGAGTCGGAC + * S5 CGAGAGAGAGAG**TCGGAC + * S6 agagagagTCGGAC + * + * Returns an array of STR vs no-STR values. + * 0 => non repetitive. + * 1+ => repeat with consecutive bit-number for repeat size. + * + * Eg: AGGGGAGGAGAAGAC + * 1111 1111 + * 2222222 + * 444444 + * => 011331137754440 + */ +char *cons_mark_STR(char *cons, int len, int lower_only) { + rep_ele *reps, *elt, *tmp; + char *str; + + str = calloc(1, len); + reps = find_STR(cons, len, lower_only); + + DL_FOREACH_SAFE(reps, elt, tmp) { + int i, v = 0; + + //printf("%2d .. %2d %.*s\n", elt->start, elt->end, + // elt->end - elt->start+1, &cons[elt->start]); + + // What is there? 
+ for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++) + v |= str[i]; + + for (i = 0; i < 8; i++) { + if (!(v&(1<start; i <= elt->end; i++) + str[i] |= v; + + DL_DELETE(reps, elt); + free(elt); + } + + return str; +} diff --git a/bcftools/str_finder.c.pysam.c b/bcftools/str_finder.c.pysam.c new file mode 100644 index 0000000..296c867 --- /dev/null +++ b/bcftools/str_finder.c.pysam.c @@ -0,0 +1,272 @@ +#include "bcftools.pysam.h" + +/* str_finder.c -- Short Tandem Repeat finder. + Originally from Crumble (https://github.com/jkbonfield/crumble) + + Copyright (C) 2015-2016, 2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include +#include +#include +#include + +#include "str_finder.h" +#include "utlist.h" + +#define MAX(a,b) ((a)>(b)?(a):(b)) +#define MIN(a,b) ((a)<(b)?(a):(b)) + +typedef unsigned char uc; + +static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, + int lower_only, unsigned int w) { + rep_ele *el, *tmp, *prev; + char *cp1, *cp2, *cp_end; + int i; + + // Already handled this in previous overlap? + if (*list) { + tmp = DL_TAIL(*list); + if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos) + return; + } + + // Find current and last occurence of repeated word. + + cp2 = &cons[pos+1]; + // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen]; + + for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads + if (*cp1 == '*') + continue; + else + i++; + while (*cp1 == '*') + cp1--; + + + // Scan ahead to see how much further it goes. + cp_end = &cons[clen]; + while (cp2 < cp_end) { + if (*cp1 != *cp2) + break; + + w<<=2; + w|=*cp2; + cp1++; + cp2++; + } + + if (!(el = malloc(sizeof(*el)))) + return; + + el->end = pos + cp2-&cons[pos+1]; + el->rep_len = rlen; + pos++; + while (rlen--) { + while (cons[--pos] == '*'); + while (cons[--pos] == '*'); + } + //pos++; + while (pos > 1 && cons[pos-1] == '*') pos--; + el->start = pos; + + // Check it meets the lower-case only criteria + if (lower_only) { + int lc = 0; + for (i = el->start; i <= el->end; i++) { + if (islower(cons[i])) { + lc = 1; + break; + } + } + + if (!lc) { + free(el); + return; + } + } + + // Remove any older items on the list that are entirely contained within el + if (*list) { + tmp = DL_TAIL(*list); + do { + prev = tmp->prev; + if (tmp->end < el->start) + break; + + if (tmp->start >= el->start) { + DL_DELETE(*list, tmp); + free(tmp); + } + + if (tmp == DL_HEAD(*list)) + break; + tmp = prev; + } while (*list); + } + + DL_APPEND(*list, el); + + return; +} + +/* + * Finds repeated homopolymers up to 8-mers. 
+ * Note this assumes cons is 0-3, so N of 4 may rarely give false hits. + * + * Returns a list of rep_ele structs holding the start,end tuples of repeats; + * NULL on failure. + */ +rep_ele *find_STR(char *cons, int len, int lower_only) { + int i, j; + uint32_t w = 0; + rep_ele *reps = NULL; + + for (i = j = 0; i < len && j < 15; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + + j++; + } + + for (; i < len; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if ((w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + else if ((w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + else if ((w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + else if ((w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + else if ((w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + else if ((w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + else if ((w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, 
lower_only, w); + } + + return reps; +} + +/* ----------------------------------------------------------------------------- + * Computes repeat regions in the consensus and then provides a bit mask + * indicating the extend of the STRs. + * + * The purpose of this is to identify where a read needs to span the entire + * region in order to validate how many copies of a repeat word are present. + * This only really has a major impact when indels are involved. + * + * For example, given this multiple alignment: + * + * S1 GATCGGACGAGAG + * S2 GATCGGACGAGAGAGAGAGAGT + * S3 GATCGGACGAGAGAGAGAG**TCGGAC + * S4 GGACGAGAGAGAGAGAGTCGGAC + * S5 CGAGAGAGAGAG**TCGGAC + * S6 AGAGAGAGTCGGAC + * + * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last + * (S1 and S6) sequences do not span and so we do not know which allele they + * match. Specifically as the pad is at the right hand end, the alignment of + * S6 gives incorrect weight to the consensus as it is stating AG when it + * may actually be ** at that point. + * + * By identifying the repeats we can soft clip as follows: + * + * S1 GATCGGACgagag + * S2 GATCGGACGAGAGAGAGAGAGT + * S3 GATCGGACGAGAGAGAGAG**TCGGAC + * S4 GGACGAGAGAGAGAGAGTCGGAC + * S5 CGAGAGAGAGAG**TCGGAC + * S6 agagagagTCGGAC + * + * Returns an array of STR vs no-STR values. + * 0 => non repetitive. + * 1+ => repeat with consecutive bit-number for repeat size. + * + * Eg: AGGGGAGGAGAAGAC + * 1111 1111 + * 2222222 + * 444444 + * => 011331137754440 + */ +char *cons_mark_STR(char *cons, int len, int lower_only) { + rep_ele *reps, *elt, *tmp; + char *str; + + str = calloc(1, len); + reps = find_STR(cons, len, lower_only); + + DL_FOREACH_SAFE(reps, elt, tmp) { + int i, v = 0; + + //printf("%2d .. %2d %.*s\n", elt->start, elt->end, + // elt->end - elt->start+1, &cons[elt->start]); + + // What is there? 
+ for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++) + v |= str[i]; + + for (i = 0; i < 8; i++) { + if (!(v&(1<start; i <= elt->end; i++) + str[i] |= v; + + DL_DELETE(reps, elt); + free(elt); + } + + return str; +} diff --git a/bcftools/str_finder.h b/bcftools/str_finder.h new file mode 100644 index 0000000..242f59e --- /dev/null +++ b/bcftools/str_finder.h @@ -0,0 +1,64 @@ +/* str_finder.c -- Short Tandem Repeat finder. + Originally from Crumble (https://github.com/jkbonfield/crumble) + + Copyright (C) 2015-2016, 2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef _STR_FINDER_H_ +#define _STR_FINDER_H_ + +#include "utlist.h" + +typedef struct rep_ele { + int start, end, rep_len; + struct rep_ele *prev; + struct rep_ele *next; +} rep_ele; + +/* + * Finds repeated homopolymers up to 8-mers. + * + * If lower_only is true then it only adds STRs for regions that + * contain at least one lower-case base. 
This can be used as a marker + * for looking for specific types of repeats. + * (One use for this is to only mark STRs that overlap a heterozygous + * indel region.) + * + * Returns a list of rep_ele structs holding the start,end tuples of repeats; + * NULL on failure. + */ +rep_ele *find_STR(char *cons, int len, int lower_only); + +/* + * Returns an array of STR vs no-STR values. + * 0 => non repetitive. + * 1+ => repeat with consecutive bit-number for repeat size. + * + * Eg: AGGGGAGGAGAAGAC + * 1111 1111 + * 2222222 + * 444444 + * => 011331137754440 + */ +char *cons_mark_STR(char *cons, int len, int lower_only); + +#endif /* _STR_FINDER_H_ */ diff --git a/bcftools/utlist.h b/bcftools/utlist.h new file mode 100644 index 0000000..28cf8a3 --- /dev/null +++ b/bcftools/utlist.h @@ -0,0 +1,761 @@ +/* +Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef UTLIST_H +#define UTLIST_H + +#define UTLIST_VERSION 1.9.9 + +#include + +/* + * This file contains macros to manipulate singly and doubly-linked lists. + * + * 1. LL_ macros: singly-linked lists. + * 2. DL_ macros: doubly-linked lists. + * 3. CDL_ macros: circular doubly-linked lists. + * + * To use singly-linked lists, your structure must have a "next" pointer. + * To use doubly-linked lists, your structure must "prev" and "next" pointers. + * Either way, the pointer to the head of the list must be initialized to NULL. + * + * ----------------.EXAMPLE ------------------------- + * struct item { + * int id; + * struct item *prev, *next; + * } + * + * struct item *list = NULL: + * + * int main() { + * struct item *item; + * ... allocate and populate item ... + * DL_APPEND(list, item); + * } + * -------------------------------------------------- + * + * For doubly-linked lists, the append and delete macros are O(1) + * For singly-linked lists, append and delete are O(n) but prepend is O(1) + * The sort macro is O(n log(n)) for all types of single/double/circular lists. + */ + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ code), this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. 
*/ +#ifdef _MSC_VER /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define LDECLTYPE(x) decltype(x) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#define LDECLTYPE(x) char* +#endif +#elif defined(__ICCARM__) +#define NO_DECLTYPE +#define LDECLTYPE(x) char* +#else /* GNU, Sun and other compilers */ +#define LDECLTYPE(x) __typeof(x) +#endif + +/* for VS2008 we use some workarounds to get around the lack of decltype, + * namely, we always reassign our tmp variable to the list head if we need + * to dereference its prev/next pointers, and save/restore the real head.*/ +#ifdef NO_DECLTYPE +#define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); } +#define _NEXT(elt,list,next) ((char*)((list)->next)) +#define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); } +/* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */ +#define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); } +#define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; } +#define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); } +#else +#define _SV(elt,list) +#define _NEXT(elt,list,next) ((elt)->next) +#define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to) +/* #define _PREV(elt,list,prev) ((elt)->prev) */ +#define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to) +#define _RS(list) +#define _CASTASGN(a,b) (a)=(b) +#endif + +/****************************************************************************** + * The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort * + * Unwieldy variable names used here to avoid shadowing passed-in variables. 
* + *****************************************************************************/ +#define LL_SORT(list, cmp) \ + LL_SORT2(list, cmp, next) + +#define LL_SORT2(list, cmp, next) \ +do { \ + LDECLTYPE(list) _ls_p; \ + LDECLTYPE(list) _ls_q; \ + LDECLTYPE(list) _ls_e; \ + LDECLTYPE(list) _ls_tail; \ + int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ + if (list) { \ + _ls_insize = 1; \ + _ls_looping = 1; \ + while (_ls_looping) { \ + _CASTASGN(_ls_p,list); \ + list = NULL; \ + _ls_tail = NULL; \ + _ls_nmerges = 0; \ + while (_ls_p) { \ + _ls_nmerges++; \ + _ls_q = _ls_p; \ + _ls_psize = 0; \ + for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ + _ls_psize++; \ + _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \ + if (!_ls_q) break; \ + } \ + _ls_qsize = _ls_insize; \ + while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ + if (_ls_psize == 0) { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } else if (_ls_qsize == 0 || !_ls_q) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else if (cmp(_ls_p,_ls_q) <= 0) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ + } else { \ + _CASTASGN(list,_ls_e); \ + } \ + _ls_tail = _ls_e; \ + } \ + _ls_p = _ls_q; \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \ + } \ + if (_ls_nmerges <= 1) { \ + _ls_looping=0; \ + } \ + _ls_insize *= 2; \ + } \ + } \ +} while (0) + + +#define DL_SORT(list, cmp) \ + DL_SORT2(list, cmp, prev, next) + +#define DL_SORT2(list, cmp, prev, next) \ +do { \ + LDECLTYPE(list) _ls_p; \ + LDECLTYPE(list) _ls_q; \ + LDECLTYPE(list) _ls_e; \ + LDECLTYPE(list) _ls_tail; \ + 
int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ + if (list) { \ + _ls_insize = 1; \ + _ls_looping = 1; \ + while (_ls_looping) { \ + _CASTASGN(_ls_p,list); \ + list = NULL; \ + _ls_tail = NULL; \ + _ls_nmerges = 0; \ + while (_ls_p) { \ + _ls_nmerges++; \ + _ls_q = _ls_p; \ + _ls_psize = 0; \ + for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ + _ls_psize++; \ + _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \ + if (!_ls_q) break; \ + } \ + _ls_qsize = _ls_insize; \ + while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ + if (_ls_psize == 0) { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } else if (_ls_qsize == 0 || !_ls_q) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else if (cmp(_ls_p,_ls_q) <= 0) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + } else { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ + } else { \ + _CASTASGN(list,_ls_e); \ + } \ + _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \ + _ls_tail = _ls_e; \ + } \ + _ls_p = _ls_q; \ + } \ + _CASTASGN(list->prev, _ls_tail); \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \ + if (_ls_nmerges <= 1) { \ + _ls_looping=0; \ + } \ + _ls_insize *= 2; \ + } \ + } \ +} while (0) + + +#define DL_HEAD(list) (list) +#define DL_TAIL(list) ((list) ? 
(list)->prev : NULL) + +#define CDL_SORT(list, cmp) \ + CDL_SORT2(list, cmp, prev, next) + +#define CDL_SORT2(list, cmp, prev, next) \ +do { \ + LDECLTYPE(list) _ls_p; \ + LDECLTYPE(list) _ls_q; \ + LDECLTYPE(list) _ls_e; \ + LDECLTYPE(list) _ls_tail; \ + LDECLTYPE(list) _ls_oldhead; \ + LDECLTYPE(list) _tmp; \ + int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ + if (list) { \ + _ls_insize = 1; \ + _ls_looping = 1; \ + while (_ls_looping) { \ + _CASTASGN(_ls_p,list); \ + _CASTASGN(_ls_oldhead,list); \ + list = NULL; \ + _ls_tail = NULL; \ + _ls_nmerges = 0; \ + while (_ls_p) { \ + _ls_nmerges++; \ + _ls_q = _ls_p; \ + _ls_psize = 0; \ + for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ + _ls_psize++; \ + _SV(_ls_q,list); \ + if (_NEXT(_ls_q,list,next) == _ls_oldhead) { \ + _ls_q = NULL; \ + } else { \ + _ls_q = _NEXT(_ls_q,list,next); \ + } \ + _RS(list); \ + if (!_ls_q) break; \ + } \ + _ls_qsize = _ls_insize; \ + while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ + if (_ls_psize == 0) { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \ + } else if (_ls_qsize == 0 || !_ls_q) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \ + } else if (cmp(_ls_p,_ls_q) <= 0) { \ + _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ + _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ + if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \ + } else { \ + _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ + _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ + if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \ + } \ + if (_ls_tail) { \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ + } else { \ + _CASTASGN(list,_ls_e); \ + } \ + _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \ + _ls_tail = _ls_e; \ + } \ + _ls_p = _ls_q; \ + } \ + 
_CASTASGN(list->prev,_ls_tail); \ + _CASTASGN(_tmp,list); \ + _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list); \ + if (_ls_nmerges <= 1) { \ + _ls_looping=0; \ + } \ + _ls_insize *= 2; \ + } \ + } \ +} while (0) + +/****************************************************************************** + * singly linked list macros (non-circular) * + *****************************************************************************/ +#define LL_PREPEND(head,add) \ + LL_PREPEND2(head,add,next) + +#define LL_PREPEND2(head,add,next) \ +do { \ + (add)->next = head; \ + head = add; \ +} while (0) + +#define LL_CONCAT(head1,head2) \ + LL_CONCAT2(head1,head2,next) + +#define LL_CONCAT2(head1,head2,next) \ +do { \ + LDECLTYPE(head1) _tmp; \ + if (head1) { \ + _tmp = head1; \ + while (_tmp->next) { _tmp = _tmp->next; } \ + _tmp->next=(head2); \ + } else { \ + (head1)=(head2); \ + } \ +} while (0) + +#define LL_APPEND(head,add) \ + LL_APPEND2(head,add,next) + +#define LL_APPEND2(head,add,next) \ +do { \ + LDECLTYPE(head) _tmp; \ + (add)->next=NULL; \ + if (head) { \ + _tmp = head; \ + while (_tmp->next) { _tmp = _tmp->next; } \ + _tmp->next=(add); \ + } else { \ + (head)=(add); \ + } \ +} while (0) + +#define LL_DELETE(head,del) \ + LL_DELETE2(head,del,next) + +#define LL_DELETE2(head,del,next) \ +do { \ + LDECLTYPE(head) _tmp; \ + if ((head) == (del)) { \ + (head)=(head)->next; \ + } else { \ + _tmp = head; \ + while (_tmp->next && (_tmp->next != (del))) { \ + _tmp = _tmp->next; \ + } \ + if (_tmp->next) { \ + _tmp->next = ((del)->next); \ + } \ + } \ +} while (0) + +/* Here are VS2008 replacements for LL_APPEND and LL_DELETE */ +#define LL_APPEND_VS2008(head,add) \ + LL_APPEND2_VS2008(head,add,next) + +#define LL_APPEND2_VS2008(head,add,next) \ +do { \ + if (head) { \ + (add)->next = head; /* use add->next as a temp variable */ \ + while ((add)->next->next) { (add)->next = (add)->next->next; } \ + (add)->next->next=(add); \ + } else { \ + (head)=(add); \ + } \ + 
(add)->next=NULL; \ +} while (0) + +#define LL_DELETE_VS2008(head,del) \ + LL_DELETE2_VS2008(head,del,next) + +#define LL_DELETE2_VS2008(head,del,next) \ +do { \ + if ((head) == (del)) { \ + (head)=(head)->next; \ + } else { \ + char *_tmp = (char*)(head); \ + while ((head)->next && ((head)->next != (del))) { \ + head = (head)->next; \ + } \ + if ((head)->next) { \ + (head)->next = ((del)->next); \ + } \ + { \ + char **_head_alias = (char**)&(head); \ + *_head_alias = _tmp; \ + } \ + } \ +} while (0) +#ifdef NO_DECLTYPE +#undef LL_APPEND +#define LL_APPEND LL_APPEND_VS2008 +#undef LL_DELETE +#define LL_DELETE LL_DELETE_VS2008 +#undef LL_DELETE2 +#define LL_DELETE2 LL_DELETE2_VS2008 +#undef LL_APPEND2 +#define LL_APPEND2 LL_APPEND2_VS2008 +#undef LL_CONCAT /* no LL_CONCAT_VS2008 */ +#undef DL_CONCAT /* no DL_CONCAT_VS2008 */ +#endif +/* end VS2008 replacements */ + +#define LL_COUNT(head,el,counter) \ + LL_COUNT2(head,el,counter,next) \ + +#define LL_COUNT2(head,el,counter,next) \ +{ \ + counter = 0; \ + LL_FOREACH2(head,el,next){ ++counter; } \ +} + +#define LL_FOREACH(head,el) \ + LL_FOREACH2(head,el,next) + +#define LL_FOREACH2(head,el,next) \ + for(el=head;el;el=(el)->next) + +#define LL_FOREACH_SAFE(head,el,tmp) \ + LL_FOREACH_SAFE2(head,el,tmp,next) + +#define LL_FOREACH_SAFE2(head,el,tmp,next) \ + for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp) + +#define LL_SEARCH_SCALAR(head,out,field,val) \ + LL_SEARCH_SCALAR2(head,out,field,val,next) + +#define LL_SEARCH_SCALAR2(head,out,field,val,next) \ +do { \ + LL_FOREACH2(head,out,next) { \ + if ((out)->field == (val)) break; \ + } \ +} while(0) + +#define LL_SEARCH(head,out,elt,cmp) \ + LL_SEARCH2(head,out,elt,cmp,next) + +#define LL_SEARCH2(head,out,elt,cmp,next) \ +do { \ + LL_FOREACH2(head,out,next) { \ + if ((cmp(out,elt))==0) break; \ + } \ +} while(0) + +#define LL_REPLACE_ELEM(head, el, add) \ +do { \ + LDECLTYPE(head) _tmp; \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); 
\ + (add)->next = (el)->next; \ + if ((head) == (el)) { \ + (head) = (add); \ + } else { \ + _tmp = head; \ + while (_tmp->next && (_tmp->next != (el))) { \ + _tmp = _tmp->next; \ + } \ + if (_tmp->next) { \ + _tmp->next = (add); \ + } \ + } \ +} while (0) + +#define LL_PREPEND_ELEM(head, el, add) \ +do { \ + LDECLTYPE(head) _tmp; \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el); \ + if ((head) == (el)) { \ + (head) = (add); \ + } else { \ + _tmp = head; \ + while (_tmp->next && (_tmp->next != (el))) { \ + _tmp = _tmp->next; \ + } \ + if (_tmp->next) { \ + _tmp->next = (add); \ + } \ + } \ +} while (0) \ + + +/****************************************************************************** + * doubly linked list macros (non-circular) * + *****************************************************************************/ +#define DL_PREPEND(head,add) \ + DL_PREPEND2(head,add,prev,next) + +#define DL_PREPEND2(head,add,prev,next) \ +do { \ + (add)->next = head; \ + if (head) { \ + (add)->prev = (head)->prev; \ + (head)->prev = (add); \ + } else { \ + (add)->prev = (add); \ + } \ + (head) = (add); \ +} while (0) + +#define DL_APPEND(head,add) \ + DL_APPEND2(head,add,prev,next) + +#define DL_APPEND2(head,add,prev,next) \ +do { \ + if (head) { \ + (add)->prev = (head)->prev; \ + (head)->prev->next = (add); \ + (head)->prev = (add); \ + (add)->next = NULL; \ + } else { \ + (head)=(add); \ + (head)->prev = (head); \ + (head)->next = NULL; \ + } \ +} while (0) + +#define DL_CONCAT(head1,head2) \ + DL_CONCAT2(head1,head2,prev,next) + +#define DL_CONCAT2(head1,head2,prev,next) \ +do { \ + LDECLTYPE(head1) _tmp; \ + if (head2) { \ + if (head1) { \ + _tmp = (head2)->prev; \ + (head2)->prev = (head1)->prev; \ + (head1)->prev->next = (head2); \ + (head1)->prev = _tmp; \ + } else { \ + (head1)=(head2); \ + } \ + } \ +} while (0) + +#define DL_DELETE(head,del) \ + DL_DELETE2(head,del,prev,next) + +#define DL_DELETE2(head,del,prev,next) \ 
+do { \ + assert((del)->prev != NULL); \ + if ((del)->prev == (del)) { \ + (head)=NULL; \ + } else if ((del)==(head)) { \ + (del)->next->prev = (del)->prev; \ + (head) = (del)->next; \ + } else { \ + (del)->prev->next = (del)->next; \ + if ((del)->next) { \ + (del)->next->prev = (del)->prev; \ + } else { \ + (head)->prev = (del)->prev; \ + } \ + } \ +} while (0) + +#define DL_COUNT(head,el,counter) \ + DL_COUNT2(head,el,counter,next) \ + +#define DL_COUNT2(head,el,counter,next) \ +{ \ + counter = 0; \ + DL_FOREACH2(head,el,next){ ++counter; } \ +} + +#define DL_FOREACH(head,el) \ + DL_FOREACH2(head,el,next) + +#define DL_FOREACH2(head,el,next) \ + for(el=head;el;el=(el)->next) + +/* this version is safe for deleting the elements during iteration */ +#define DL_FOREACH_SAFE(head,el,tmp) \ + DL_FOREACH_SAFE2(head,el,tmp,next) + +#define DL_FOREACH_SAFE2(head,el,tmp,next) \ + for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp) + +/* these are identical to their singly-linked list counterparts */ +#define DL_SEARCH_SCALAR LL_SEARCH_SCALAR +#define DL_SEARCH LL_SEARCH +#define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2 +#define DL_SEARCH2 LL_SEARCH2 + +#define DL_REPLACE_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + if ((head) == (el)) { \ + (head) = (add); \ + (add)->next = (el)->next; \ + if ((el)->next == NULL) { \ + (add)->prev = (add); \ + } else { \ + (add)->prev = (el)->prev; \ + (add)->next->prev = (add); \ + } \ + } else { \ + (add)->next = (el)->next; \ + (add)->prev = (el)->prev; \ + (add)->prev->next = (add); \ + if ((el)->next == NULL) { \ + (head)->prev = (add); \ + } else { \ + (add)->next->prev = (add); \ + } \ + } \ +} while (0) + +#define DL_PREPEND_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el); \ + (add)->prev = (el)->prev; \ + (el)->prev = (add); \ + if ((head) == (el)) { \ + (head) = (add); \ + } else { \ + 
(add)->prev->next = (add); \ + } \ +} while (0) \ + + +/****************************************************************************** + * circular doubly linked list macros * + *****************************************************************************/ +#define CDL_PREPEND(head,add) \ + CDL_PREPEND2(head,add,prev,next) + +#define CDL_PREPEND2(head,add,prev,next) \ +do { \ + if (head) { \ + (add)->prev = (head)->prev; \ + (add)->next = (head); \ + (head)->prev = (add); \ + (add)->prev->next = (add); \ + } else { \ + (add)->prev = (add); \ + (add)->next = (add); \ + } \ +(head)=(add); \ +} while (0) + +#define CDL_DELETE(head,del) \ + CDL_DELETE2(head,del,prev,next) + +#define CDL_DELETE2(head,del,prev,next) \ +do { \ + if ( ((head)==(del)) && ((head)->next == (head))) { \ + (head) = 0L; \ + } else { \ + (del)->next->prev = (del)->prev; \ + (del)->prev->next = (del)->next; \ + if ((del) == (head)) (head)=(del)->next; \ + } \ +} while (0) + +#define CDL_COUNT(head,el,counter) \ + CDL_COUNT2(head,el,counter,next) \ + +#define CDL_COUNT2(head, el, counter,next) \ +{ \ + counter = 0; \ + CDL_FOREACH2(head,el,next){ ++counter; } \ +} + +#define CDL_FOREACH(head,el) \ + CDL_FOREACH2(head,el,next) + +#define CDL_FOREACH2(head,el,next) \ + for(el=head;el;el=((el)->next==head ? 0L : (el)->next)) + +#define CDL_FOREACH_SAFE(head,el,tmp1,tmp2) \ + CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) + +#define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) \ + for((el)=(head), ((tmp1)=(head)?((head)->prev):NULL); \ + (el) && ((tmp2)=(el)->next, 1); \ + ((el) = (((el)==(tmp1)) ? 
0L : (tmp2)))) + +#define CDL_SEARCH_SCALAR(head,out,field,val) \ + CDL_SEARCH_SCALAR2(head,out,field,val,next) + +#define CDL_SEARCH_SCALAR2(head,out,field,val,next) \ +do { \ + CDL_FOREACH2(head,out,next) { \ + if ((out)->field == (val)) break; \ + } \ +} while(0) + +#define CDL_SEARCH(head,out,elt,cmp) \ + CDL_SEARCH2(head,out,elt,cmp,next) + +#define CDL_SEARCH2(head,out,elt,cmp,next) \ +do { \ + CDL_FOREACH2(head,out,next) { \ + if ((cmp(out,elt))==0) break; \ + } \ +} while(0) + +#define CDL_REPLACE_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + if ((el)->next == (el)) { \ + (add)->next = (add); \ + (add)->prev = (add); \ + (head) = (add); \ + } else { \ + (add)->next = (el)->next; \ + (add)->prev = (el)->prev; \ + (add)->next->prev = (add); \ + (add)->prev->next = (add); \ + if ((head) == (el)) { \ + (head) = (add); \ + } \ + } \ +} while (0) + +#define CDL_PREPEND_ELEM(head, el, add) \ +do { \ + assert(head != NULL); \ + assert(el != NULL); \ + assert(add != NULL); \ + (add)->next = (el); \ + (add)->prev = (el)->prev; \ + (el)->prev = (add); \ + (add)->prev->next = (add); \ + if ((head) == (el)) { \ + (head) = (add); \ + } \ +} while (0) \ + +#endif /* UTLIST_H */ + diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index 3697847..0976fe3 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -26,6 +26,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ annot_line_t; #define REPLACE_ALL 1 // replace both missing and existing values #define REPLACE_NON_MISSING 2 // replace only if tgt is not missing #define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise +#define MATCH_VALUE 4 // do not set, just match the value -c ~ID #define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest #define MM_APPEND 1 // append, possibly multiple times #define MM_UNIQUE 2 // append, only unique values @@ -77,19 +79,26 @@ annot_line_t; #define MM_AVG 4 #define MM_MIN 5 #define MM_MAX 6 +#define MM_APPEND_MISSING 7 // missing values will be transferred as well typedef struct _annot_col_t { int icol, replace, number; // number: one of BCF_VL_* types char *hdr_key_src, *hdr_key_dst; - int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); + // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors, + // or 1 on (repeated partial updates) concluded with a src=NULL call + int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t + int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr); int merge_method; // one of the MM_* defines khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values kstring_t mm_kstr; - double + size_t mm_dbl_nalloc, // the allocated size --merge-logic values array mm_dbl_nused, // the number of used elements in the mm_dbl array - mm_dbl_ndat, // the number of merged rows (for calculating the average) + mm_dbl_ndat; // the number of merged rows (for calculating the average) + double *mm_dbl; + void *ptr; + int mptr, done; } annot_col_t; @@ -103,12 +112,12 @@ annot_col_t; typedef struct _args_t { bcf_srs_t *files; - bcf_hdr_t 
*hdr, *hdr_out; + bcf_hdr_t *hdr, *hdr_out, *tgts_hdr; htsFile *out_fh; int output_type, n_threads; bcf_sr_regions_t *tgts; - regidx_t *tgt_idx; + regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns regitr_t *tgt_itr; int tgt_is_bed; @@ -123,10 +132,13 @@ typedef struct _args_t vcmp_t *vcmp; // for matching annotation and VCF lines by allele annot_line_t *alines; // buffered annotation lines - int nalines, malines; + annot_line_t *aline_missing; + uint32_t *srt_alines; // sorted indexes (iALT<<16 || iAline) + int nalines, malines, nsrt_alines, msrt_alines; int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present annot_col_t *cols; // column indexes and setters int ncols; + int match_id; // set iff `-c ~ID` given char *set_ids_fmt; convert_t *set_ids; @@ -144,9 +156,10 @@ typedef struct _args_t kstring_t tmpks; char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; - char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; - char *merge_method_str; + char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites; + kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; + int columns_is_file, has_append_mode; } args_t; @@ -195,6 +208,8 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag) for (i=0; in_info; i++) { bcf_info_t *inf = &line->d.info[i]; + if ( !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) ) + line->rlen = line->n_allele ? 
strlen(line->d.allele[0]) : 0; if ( inf->vptr_free ) { free(inf->vptr - inf->vptr_off); @@ -374,6 +389,10 @@ static void init_remove_annots(args_t *args) } else if ( str.l ) { + int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s); + if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s); + if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s); + if ( !args->keep_sites ) { if ( str.s[0]=='#' && str.s[1]=='#' ) @@ -441,6 +460,42 @@ static void init_header_lines(args_t *args) if (bcf_hdr_sync(args->hdr) < 0) error_errno("[%s] Failed to update input header", __func__); } +static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) +{ + return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); +} +static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) +{ + char *str = *((char**)ptr); + int len = strlen(rec->d.id); + if ( len >= *mptr ) str = realloc(str, len+1); + strcpy(str, rec->d.id); + *((char**)ptr) = str; + *mptr = len+1; + return len; +} +static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) +{ + kstring_t str; + str.s = *((char**)ptr); + str.m = *mptr; + str.l = 0; + + int i; + if ( rec->d.n_flt ) + { + for (i=0; id.n_flt; i++) + { + if (i) kputc(';', &str); + kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str); + } + } + else kputc('.', &str); + + *((char**)ptr) = str.s; + *mptr = str.m; + return str.l; +} static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n"); @@ -450,24 +505,24 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." 
hts_expand(int,1,args->mtmpi,args->tmpi); args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]); - if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]); - if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; } + if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]); + if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); if ( col->replace!=REPLACE_MISSING ) { bcf_update_filter(args->hdr_out,line,NULL,0); - bcf_update_filter(args->hdr_out,line,args->tmpi,1); - return 0; + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); } // only update missing FILTER if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); if ( !line->d.n_flt ) - bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return 0; } static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - int i; + int i, ret = 0; bcf1_t *rec = (bcf1_t*) data; if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT); if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); @@ -478,9 +533,9 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void for (i=0; id.n_flt; i++) { const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]); - bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)); + if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1; } - return 0; + return ret; } hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi); for (i=0; id.n_flt; i++) @@ -489,12 +544,12 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); } 
bcf_update_filter(args->hdr_out,line,NULL,0); - bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); - return 0; + return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); } static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); + if ( col->replace==MATCH_VALUE ) return 0; // possible cases: // IN ANNOT OUT ACHIEVED_BY @@ -517,14 +572,28 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) } static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { + if ( col->replace==MATCH_VALUE ) return 0; + bcf1_t *rec = (bcf1_t*) data; - if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "." - if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id); - if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id); + + char *id; + if ( col->getter ) + { + int nret = col->getter(args,rec,col,&col->ptr,&col->mptr); + id = (char*) col->ptr; + if ( nret<=0 || (nret==1 && *id=='.') ) return 0; // don't replace with "." + } + else + { + if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "." + id = rec->d.id; + } + if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id); + if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id); // running with +ID, only update missing ids if ( !line->d.id || (line->d.id[0]=='.' 
&& !line->d.id[1]) ) - return bcf_update_id(args->hdr_out,line,rec->d.id); + return bcf_update_id(args->hdr_out,line,id); return 0; } static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -535,9 +604,9 @@ static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *da als[0] = rec->d.allele[0]; int i; for (i=1; in_allele; i++) als[i] = line->d.allele[i]; - bcf_update_alleles(args->hdr_out, line, als, line->n_allele); + int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele); free(als); - return 0; + return ret; } static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -551,9 +620,9 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele); als[0] = line->d.allele[0]; for (i=1; in_allele; i++) als[i] = rec->d.allele[i]; - bcf_update_alleles(args->hdr_out, line, als, rec->n_allele); + int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele); free(als); - return 0; + return ret; } static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -627,34 +696,51 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpi2[i] = args->tmpi[ map[i] ]; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); } static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { annot_line_t *tab = (annot_line_t*) data; + // This is a bit hacky, only to reuse existing code with minimal changes: + // -c =TAG will now behave as -l TAG:APPEND for integers + if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( !tab ) { - if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && 
col->merge_method!=MM_APPEND ) - error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); + if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && + col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && + col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND_MISSING ) + error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n"); } int i,ntmpi = 0; - if ( tab ) + if ( tab ) // has data, not flushing yet { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; while ( *end ) { - int val = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ntmpi++; hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); - args->tmpi[ntmpi-1] = val; - str = end+1; + if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) + { + if ( col->merge_method==MM_APPEND_MISSING ) + args->tmpi[ntmpi-1] = bcf_int32_missing; + else + ntmpi--; + if ( str[1]==0 ) end = str+1; + str += 2; + } + else + { + args->tmpi[ntmpi-1] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + str = end+1; + } } if ( col->merge_method!=MM_FIRST ) { @@ -667,7 +753,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } else { - if ( col->merge_method==MM_APPEND ) + if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { int nori = col->mm_dbl_nused; col->mm_dbl_nused += ntmpi; @@ -687,9 +773,10 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } } col->mm_dbl_ndat++; + return 1; } } - else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) + else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { ntmpi = col->mm_dbl_nused; hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); @@ -713,8 +800,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -731,8 +817,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { @@ -763,34 +848,51 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpf2[i] = args->tmpf[ map[i] ]; } - 
bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); } static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { annot_line_t *tab = (annot_line_t*) data; + // This is a bit hacky, only to reuse existing code with minimal changes: + // -c =TAG will now behave as -l TAG:APPEND for floats + if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( !tab ) { - if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) - error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); + if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && + col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && + col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND_MISSING ) + error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n"); } int i,ntmpf = 0; if ( tab ) { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; while ( *end ) { - double val = strtod(str, &end); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ntmpf++; hts_expand(float,ntmpf,args->mtmpf,args->tmpf); - args->tmpf[ntmpf-1] = val; - str = end+1; + if ( str[0]=='.' 
&& (str[1]==0 || str[1]==',') ) + { + if ( col->merge_method==MM_APPEND_MISSING ) + bcf_float_set_missing(args->tmpf[ntmpf-1]); + else + ntmpf--; + if ( str[1]==0 ) end = str+1; + str += 2; + } + else + { + args->tmpf[ntmpf-1] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + str = end+1; + } } if ( col->merge_method!=MM_FIRST ) { @@ -799,17 +901,27 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * col->mm_dbl_nused = ntmpf; hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); for (i=0; imm_dbl[i] = args->tmpf[i]; + { + if ( bcf_float_is_missing(args->tmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i]); + else + col->mm_dbl[i] = args->tmpf[i]; + } } else { - if ( col->merge_method==MM_APPEND ) + if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { int nori = col->mm_dbl_nused; col->mm_dbl_nused += ntmpf; hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); for (i=0; imm_dbl[i+nori] = args->tmpf[i]; + { + if ( bcf_float_is_missing(args->tmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i+nori]); + else + col->mm_dbl[i+nori] = args->tmpf[i]; + } } else { @@ -823,13 +935,20 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * } } col->mm_dbl_ndat++; + return 1; } } - else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) + else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { ntmpf = col->mm_dbl_nused; hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); - for (i=0; itmpf[i] = col->mm_dbl[i]; + for (i=0; imm_dbl[i]) ) + bcf_float_set_missing(args->tmpf[i]); + else + args->tmpf[i] = col->mm_dbl[i]; + } col->mm_dbl_nused = 
col->mm_dbl_ndat = 0; } else if ( col->merge_method==MM_AVG ) @@ -849,8 +968,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); } static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -867,8 +985,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) @@ -923,10 +1040,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set } int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i); - assert( ret==0 ); + if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n", __FILE__,__LINE__,__func__); } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); } void khash_str2int_clear_free(void *_hash) { @@ -945,14 +1061,18 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; } + // This is a bit hacky, only to reuse existing code with minimal changes: + // -c =TAG will now behave as -l TAG:unique for strings + if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE; + annot_line_t *tab = (annot_line_t*) data; - + int len = 0; if ( tab ) { len = strlen(tab->cols[col->icol]); if ( !len ) return 0; - if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0; + if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1; } if ( col->merge_method!=MM_FIRST ) @@ -962,17 +1082,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( data ) { - assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); + assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE ); if ( col->merge_method==MM_UNIQUE ) { if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); - if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; + if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1; khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); } if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); kputs(tab->cols[col->icol], &col->mm_kstr); - return 0; + return 1; } if ( col->mm_kstr.l ) @@ -983,12 +1103,10 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d else return 0; - if ( !data ) // flush the line - { - if ( col->merge_method==MM_UNIQUE ) - khash_str2int_clear_free(col->mm_str_hash); - col->mm_kstr.l = 0; - } + // flush the line + if ( col->merge_method==MM_UNIQUE ) + khash_str2int_clear_free(col->mm_str_hash); + col->mm_kstr.l = 0; } else { @@ -1000,14 +1118,19 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } - 
bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); } static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps); - if ( ntmps < 0 ) return 0; // nothing to add + + if ( col->getter ) + col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps); + else + { + int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps); + if ( ntmps < 0 ) return 0; // nothing to add + } if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); @@ -1018,8 +1141,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); } static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) { @@ -1689,7 +1811,6 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, } } return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1); - } static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -1771,17 +1892,12 @@ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) // tab annotation file, expecting that all samples are present: sample map not needed if ( !src ) return 0; - int nmatch = 0, order_ok = 1; + int nmatch = 0; for (i=0; isamples[i]); - if ( id!=-1 ) - { - nmatch++; - if ( i!=id ) order_ok = 0; - } + if ( id!=-1 ) nmatch++; } - if ( 
bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed if ( !nmatch ) return -1; // No matching samples found in the source and the destination file args->nsample_map = bcf_hdr_nsamples(dst); @@ -1900,11 +2016,45 @@ static void init_columns(args_t *args) int need_sample_map = 0; int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr); + kstring_t tmp = {0,0,0}; + if ( args->columns_is_file ) + { + int i,n; + char **str = hts_readlist(args->columns, args->columns_is_file, &n); + if ( !str ) error("Could not parse %s\n", args->columns); + for (i=0; imerge_method_str.l ) kputc(',',&args->merge_method_str); + kputs(str[i],&args->merge_method_str); + kputc(':',&args->merge_method_str); + kputs(ptr,&args->merge_method_str); + } + } + if ( tmp.l ) kputc(',',&tmp); + kputs(str[i],&tmp); + free(str[i]); + } + free(str); + free(args->columns); + args->columns = tmp.s; + tmp.l = tmp.m = 0; + tmp.s = NULL; + } + void *skip_fmt = NULL, *skip_info = NULL; if ( args->tgts_is_vcf ) args->columns = columns_complement(args->columns, &skip_info, &skip_fmt); - kstring_t str = {0,0,0}, tmp = {0,0,0}; + kstring_t str = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; int icol = -1, has_fmt_str = 0; @@ -1929,6 +2079,7 @@ static void init_columns(args_t *args) { args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->setter = vcf_setter_ref; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); @@ -1941,28 +2092,54 @@ static void init_columns(args_t *args) { args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->setter = vcf_setter_alt; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); } else 
args->alt_idx = icol; } - else if ( !strcasecmp("ID",str.s) ) + else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) ) { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( str.s[0]=='~' ) replace = MATCH_VALUE; + if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); + if ( replace==MATCH_VALUE ) args->match_id = icol; + } + else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column + { + if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = vcf_setter_id; + col->getter = vcf_getter_info_str2str; + str.s[2] = 0; + col->hdr_key_dst = strdup(str.s); + col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? 
strdup(str.s+4) : strdup(str.s+4+5); + int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src); + if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) + error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname); + if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR ) + error("Only Type=String tags can be used to annotate the ID column\n"); } else if ( !strcasecmp("FILTER",str.s) ) { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter; @@ -1977,7 +2154,7 @@ static void init_columns(args_t *args) bcf_hrec_t *hrec = tgts_hdr->hrec[j]; if ( hrec->type!=BCF_HL_FLT ) continue; int k = bcf_hrec_find_key(hrec,"ID"); - assert( k>=0 ); // this should always be true for valid VCFs + if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__); tmp.l = 0; bcf_hrec_format(hrec, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); @@ -1992,6 +2169,7 @@ static void init_columns(args_t *args) if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? 
vcf_setter_qual : setter_qual; @@ -2001,7 +2179,7 @@ static void init_columns(args_t *args) else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); + if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n"); bcf_hdr_t *tgts_hdr = args->files->readers[1].header; int j; for (j=0; jnhrec; j++) @@ -2019,6 +2197,7 @@ static void init_columns(args_t *args) int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = -1; col->replace = replace; col->hdr_key_src = strdup(hrec->vals[k]); @@ -2054,11 +2233,16 @@ static void init_columns(args_t *args) int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = -1; col->replace = replace; col->hdr_key_src = strdup(hrec->vals[k]); col->hdr_key_dst = strdup(hrec->vals[k]); - if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt; + if ( !strcasecmp("GT",col->hdr_key_src) ) + { + if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n"); + col->setter = vcf_setter_format_gt; + } else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -2097,9 +2281,10 @@ static void init_columns(args_t *args) } int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) - error("The tag \"%s\" is not defined in %s\n", str.s, 
args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); if ( !args->tgts_is_vcf ) { col->icol = icol; @@ -2110,7 +2295,11 @@ static void init_columns(args_t *args) col->replace = replace; col->hdr_key_src = strdup(key_src); col->hdr_key_dst = strdup(key_dst); - if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt; + if ( !strcasecmp("GT",key_src) ) + { + if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n"); + col->setter = vcf_setter_format_gt; + } else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -2129,13 +2318,20 @@ static void init_columns(args_t *args) else { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); - int explicit_info = 0; + if ( replace==SET_OR_APPEND ) + { + if ( args->tgts_is_vcf ) + error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n" + " (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n"); + fprintf(stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n"); + } + int explicit_src_info = 0; + int explicit_dst_info = 0; char *key_dst; if ( !strncasecmp("INFO/",str.s,5) ) { key_dst = str.s + 5; - explicit_info = 1; + explicit_dst_info = 1; } else key_dst = str.s; @@ -2147,7 +2343,7 @@ static void init_columns(args_t *args) if ( !strncasecmp("INFO/",key_src,5) ) { key_src += 5; - explicit_info = 1; + explicit_src_info = 1; } else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) ) { @@ 
-2157,38 +2353,65 @@ static void init_columns(args_t *args) } else key_src = key_dst; + + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->hdr_key_src = strdup(key_src); + col->hdr_key_dst = strdup(key_dst); + int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ) { if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); - if ( !hrec ) + if ( !strcasecmp("ID",key_src) && !explicit_src_info ) { - if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) - error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); - fprintf(stderr,"[%s] %d\n",key_src,explicit_info); - error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); + // transferring ID column into a new INFO tag + tmp.l = 0; + ksprintf(&tmp,"##INFO=",key_dst); + } + else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) + { + // transferring FILTER column into a new INFO tag + tmp.l = 0; + ksprintf(&tmp,"##INFO=",key_dst); + } + else + { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); + if ( !hrec ) + { + if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) + error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); + char *ptr = strchr(key_src,'='); + if ( ptr ) + { + *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '='; + error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); + } + error("The tag \"%s\" is not defined in %s, was the -h 
option provided?\n", key_src,args->files->readers[1].fname); + } + tmp.l = 0; + bcf_hrec_format_rename(hrec, key_dst, &tmp); } - tmp.l = 0; - bcf_hrec_format_rename(hrec, key_dst, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } - - args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); - annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = icol; - col->replace = replace; - col->hdr_key_src = strdup(key_src); - col->hdr_key_dst = strdup(key_dst); - col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); + if ( args->tgts_is_vcf ) + { + if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str; + else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str; + } + col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break; @@ -2197,6 +2420,18 @@ static void init_columns(args_t *args) case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id)); } + if ( replace==SET_OR_APPEND ) // change to Number=. 
+ { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL); + if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst); + hrec = bcf_hrec_dup(hrec); + int j = bcf_hrec_find_key(hrec, "Number"); + if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst); + free(hrec->vals[j]); + hrec->vals[j] = strdup("."); + bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst); + bcf_hdr_add_hrec(args->hdr_out, hrec); + } } if ( !*se ) break; ss = ++se; @@ -2232,10 +2467,10 @@ static void init_merge_method(args_t *args) args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0; memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr)); } - if ( !args->merge_method_str ) return; + if ( !args->merge_method_str.l ) return; if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n"); - if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n"); - char *sb = args->merge_method_str; + if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n"); + char *sb = args->merge_method_str.s; while ( *sb ) { char *se = sb; @@ -2246,21 +2481,27 @@ static void init_merge_method(args_t *args) char *mm_type_str = args->tmpks.s + args->tmpks.l; while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--; if ( *mm_type_str!=':' ) - error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str); + error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s); *mm_type_str = 0; mm_type_str++; int mm_type = MM_FIRST; if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; + else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST; else if ( !strcasecmp("append",mm_type_str) ) mm_type = 
MM_APPEND; + else if ( !strcasecmp("append-missing",mm_type_str) ) + { + mm_type = MM_APPEND_MISSING; + if ( args->ref_idx!=-1 ) args->has_append_mode = 1; + } else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM; else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX; - else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str); + else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str); for (i=0; incols; i++) { if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; - if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) + if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR ) error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); args->cols[i].merge_method = mm_type; break; @@ -2268,6 +2509,20 @@ static void init_merge_method(args_t *args) if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); sb = *se ? 
se + 1 : se; } + if ( args->has_append_mode ) + { + // create a missing line to insert missing values when VCF ALT finds no match in the annotation file + args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing)); + int ncol = 0; + for (i=0; incols; i++) + if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1; + if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1; + args->aline_missing->mcols = ncol; + args->aline_missing->ncols = ncol; + args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*)); + for (i=0; ialine_missing->cols[i] = strdup("."); + } } static void rename_chrs(args_t *args, char *fname) @@ -2299,6 +2554,42 @@ static void rename_chrs(args_t *args, char *fname) free(map); } +static void rename_annots(args_t *args, char *fname) +{ + int n, i; + char **map = hts_readlist(fname, 1, &n); + if ( !map ) error("Could not read: %s\n", fname); + for (i=0; ihdr_out, BCF_DT_ID, sb); + if ( id<0 ) continue; + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL); + if ( !hrec ) continue; // the sequence not present + int j = bcf_hrec_find_key(hrec, "ID"); + assert( j>=0 ); + free(hrec->vals[j]); + ss++; + while ( *ss && isspace(*ss) ) ss++; + char *se = ss; + while ( *se && !isspace(*se) ) se++; + *se = 0; + hrec->vals[j] = strdup(ss); + args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + } + for (i=0; ihdr = args->files->readers[0].header; @@ -2311,6 +2602,7 @@ static void init_data(args_t *args) // reading annots from a VCF if ( !bcf_sr_add_reader(args->files, args->targets_fname) ) error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum)); + args->tgts_hdr = args->files->readers[1].header; } if ( args->columns ) init_columns(args); if ( args->targets_fname && !args->tgts_is_vcf ) @@ -2318,8 +2610,8 @@ static void init_data(args_t *args) if ( !args->columns ) error("The -c option not given\n"); if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n"); if ( 
args->beg_idx==-1 ) error("The -c POS option not given\n"); - if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); - if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) ) + if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); + if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) ) { args->end_idx = -args->beg_idx - 1; args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx); @@ -2363,8 +2655,9 @@ static void init_data(args_t *args) if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); + if ( args->rename_annots ) rename_annots(args, args->rename_annots); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); @@ -2386,8 +2679,15 @@ static void destroy_data(args_t *args) free(args->cols[i].mm_kstr.s); if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash); free(args->cols[i].mm_dbl); + free(args->cols[i].ptr); } free(args->cols); + if ( args->aline_missing ) + { + for (i=0; ialine_missing->ncols; i++) free(args->aline_missing->cols[i]); + free(args->aline_missing->cols); + free(args->aline_missing); + } for (i=0; imalines; i++) { free(args->alines[i].cols); @@ -2395,6 +2695,7 @@ static void destroy_data(args_t *args) free(args->alines[i].line.s); } free(args->alines); + free(args->srt_alines); if ( args->tgt_idx ) { regidx_destroy(args->tgt_idx); @@ -2420,6 +2721,7 @@ static void destroy_data(args_t *args) 
filter_destroy(args->filter); if (args->out_fh) hts_close(args->out_fh); free(args->sample_map); + free(args->merge_method_str.s); } static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) @@ -2483,7 +2785,6 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } else i++; } - if ( args->ref_idx==-1 && args->nalines ) return; while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) ) @@ -2504,6 +2805,36 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } } +// search string in semicolon separated strings (xx vs aa;bb) +static int str_match(char *needle, char *haystack) +{ + int len = strlen(needle); + char *ptr = haystack; + while ( *ptr && (ptr=strstr(ptr,needle)) ) + { + if ( ptr[len]!=0 && ptr[len]!=';' ) ptr++; // a prefix, not a match + else if ( ptr==haystack || ptr[-1]==';' ) return 1; // a match + ptr++; // a suffix, not a match + } + return 0; +} +// search common string in semicolon separated strings (xx;yy;zz vs aa;bb) +static int strstr_match(char *a, char *b) +{ + char *beg = a; + while ( *beg ) + { + char *end = beg; + while ( *end && *end!=';' ) end++; + char tmp = *end; + if ( *end==';' ) *end = 0; + int ret = str_match(beg,b); + *end = tmp; + if ( ret || !*end ) return ret; + beg = end + 1; + } + return 0; +} static void annotate(args_t *args, bcf1_t *line) { int i, j; @@ -2511,9 +2842,9 @@ static void annotate(args_t *args, bcf1_t *line) args->rm[i].handler(args, line, &args->rm[i]); int has_overlap = 0; - if ( args->tgt_idx ) { + for (j=0; jncols; j++) args->cols[j].done = 0; if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { while ( regitr_overlap(args->tgt_itr) ) @@ -2524,49 +2855,145 @@ static void annotate(args_t *args, bcf1_t *line) tmp->end = args->tgt_itr->end; parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); for (j=0; jncols; j++) - if ( 
args->cols[j].setter(args,line,&args->cols[j],tmp) ) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],tmp); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } } has_overlap = 1; } for (j=0; jncols; j++) - if ( args->cols[j].merge_method != MM_FIRST ) - args->cols[j].setter(args,line,&args->cols[j],NULL); + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } } else if ( args->tgts ) { - // Buffer annotation lines. When multiple ALT alleles are present in the - // annotation file, at least one must match one of the VCF alleles. - int len = 0; - bcf_get_variant_types(line); - for (i=1; in_allele; i++) - if ( len > line->d.var[i].n ) len = line->d.var[i].n; - int end_pos = len<0 ? line->pos - len : line->pos; + // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one + // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the + // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found + // for an ALT, missing value is appended instead. 
+ int end_pos = line->pos + line->rlen - 1; buffer_annot_lines(args, line, line->pos, end_pos); + + args->nsrt_alines = 0; + hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines); + if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) + error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + + // Find matching lines for (i=0; inalines; i++) { if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue; - if ( args->ref_idx != -1 ) + if ( args->ref_idx != -1 ) // REF+ALT matching requested { - if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible + if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible for (j=1; jalines[i].nals; j++) { - if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "." - if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break; + int ialt; + if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "." 
+ ialt = 0; + else + { + ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]); + if ( ialt < 0 ) continue; + ialt++; + } + if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; + args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; + has_overlap = 1; + break; } - if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT } - break; + else // overlap, REF+ALT matching not requested + { + args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i; + has_overlap = 1; + } } - - if ( inalines ) + // Sort lines if needed + if ( args->has_append_mode ) + { + // insertion sort by VCF ALT index (top bits) and alines index (low bits) + uint32_t tmp; + for (i=1; insrt_alines; i++) + for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--) + tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp; + } + // Annotate + for (j=0; jncols; j++) args->cols[j].done = 0; + int ialt_exp = 1; + for (i=0; insrt_alines; i++) { - // there is a matching line + int ialt = args->srt_alines[i] >> 16; + int ilin = args->srt_alines[i] & 0xffff; + if ( args->has_append_mode ) + { + if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position + if ( ialt_exp < ialt ) + { + // REF+ALT matching requested, append-missing mode: insert "." 
if no annotation line was found for the ALT + while ( ialt_exp++ < ialt ) + { + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + } + } + } for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + ialt_exp = ialt + 1; + } + if ( args->nsrt_alines ) + { + // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one + // record was found. Otherwise leave the row will be left without annotation. 
+ if ( args->has_append_mode && ialt_exp < line->n_allele ) + { + while ( ialt_exp++ < line->n_allele ) + { + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + } + } + // Flush + for (j=0; jncols; j++) + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); + if ( ret < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } } - has_overlap = inalines ? 1 : 0; } else if ( args->files->nreaders == 2 ) { @@ -2611,28 +3038,30 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools annotate [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(stderr, " --collapse matching records by , see man page for details [some]\n"); - fprintf(stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(stderr, " -I, --set-id [+] set ID column, see man page for details\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); - fprintf(stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); - fprintf(stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); - fprintf(stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(stderr, " --collapse STR matching records by , see man page for details [some]\n"); + fprintf(stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); + fprintf(stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); + fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); + fprintf(stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n"); + fprintf(stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n"); + fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); + fprintf(stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); + fprintf(stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(stderr, " -r, --regions REGION restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE restrict to regions listed in 
FILE\n"); + fprintf(stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); + fprintf(stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n"); + fprintf(stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); + fprintf(stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); + fprintf(stderr, " --threads INT number of extra output compression threads [0]\n"); fprintf(stderr, "\n"); exit(1); } @@ -2649,6 +3078,7 @@ int main_vcfannotate(int argc, char *argv[]) args->record_cmd_line = 1; args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; args->set_ids_replace = 1; + args->match_id = -1; int regions_is_file = 0, collapse = 0; static struct option loptions[] = @@ -2667,7 +3097,9 @@ int main_vcfannotate(int argc, char *argv[]) {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"remove",required_argument,NULL,'x'}, + {"columns-file",required_argument,NULL,'C'}, {"columns",required_argument,NULL,'c'}, + {"rename-annots",required_argument,NULL,11}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, {"samples",required_argument,NULL,'s'}, @@ -2677,7 +3109,7 @@ int main_vcfannotate(int argc, char *argv[]) {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { case 'f': args->force = 1; break; @@ 
-2688,11 +3120,15 @@ int main_vcfannotate(int argc, char *argv[]) else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } else args->mark_sites = optarg; break; - case 'l': args->merge_method_str = optarg; break; + case 'l': + if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str); + kputs(optarg,&args->merge_method_str); + break; case 'I': args->set_ids_fmt = optarg; break; case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 'c': args->columns = strdup(optarg); break; + case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -2703,8 +3139,12 @@ int main_vcfannotate(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); }; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'x': args->remove_annots = optarg; break; case 'a': args->targets_fname = optarg; break; case 'r': args->regions_list = optarg; break; @@ -2724,6 +3164,7 @@ int main_vcfannotate(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; + case 11 : args->rename_annots = optarg; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfannotate.c.pysam.c 
b/bcftools/vcfannotate.c.pysam.c index e9d31bf..b7e707b 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2,7 +2,7 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -28,6 +28,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ annot_line_t; #define REPLACE_ALL 1 // replace both missing and existing values #define REPLACE_NON_MISSING 2 // replace only if tgt is not missing #define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise +#define MATCH_VALUE 4 // do not set, just match the value -c ~ID #define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest #define MM_APPEND 1 // append, possibly multiple times #define MM_UNIQUE 2 // append, only unique values @@ -79,19 +81,26 @@ annot_line_t; #define MM_AVG 4 #define MM_MIN 5 #define MM_MAX 6 +#define MM_APPEND_MISSING 7 // missing values will be transferred as well typedef struct _annot_col_t { int icol, replace, number; // number: one of BCF_VL_* types char *hdr_key_src, *hdr_key_dst; - int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); + // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors, + // or 1 on (repeated partial updates) concluded with a src=NULL call + int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t + int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr); int merge_method; // one of the MM_* defines khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values kstring_t mm_kstr; - double + size_t mm_dbl_nalloc, // the allocated size --merge-logic values array mm_dbl_nused, 
// the number of used elements in the mm_dbl array - mm_dbl_ndat, // the number of merged rows (for calculating the average) + mm_dbl_ndat; // the number of merged rows (for calculating the average) + double *mm_dbl; + void *ptr; + int mptr, done; } annot_col_t; @@ -105,12 +114,12 @@ annot_col_t; typedef struct _args_t { bcf_srs_t *files; - bcf_hdr_t *hdr, *hdr_out; + bcf_hdr_t *hdr, *hdr_out, *tgts_hdr; htsFile *out_fh; int output_type, n_threads; bcf_sr_regions_t *tgts; - regidx_t *tgt_idx; + regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns regitr_t *tgt_itr; int tgt_is_bed; @@ -125,10 +134,13 @@ typedef struct _args_t vcmp_t *vcmp; // for matching annotation and VCF lines by allele annot_line_t *alines; // buffered annotation lines - int nalines, malines; + annot_line_t *aline_missing; + uint32_t *srt_alines; // sorted indexes (iALT<<16 || iAline) + int nalines, malines, nsrt_alines, msrt_alines; int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present annot_col_t *cols; // column indexes and setters int ncols; + int match_id; // set iff `-c ~ID` given char *set_ids_fmt; convert_t *set_ids; @@ -146,9 +158,10 @@ typedef struct _args_t kstring_t tmpks; char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; - char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; - char *merge_method_str; + char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites; + kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; + int columns_is_file, has_append_mode; } args_t; @@ -197,6 +210,8 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag) for (i=0; in_info; i++) { bcf_info_t *inf = &line->d.info[i]; + if ( !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) ) + line->rlen = line->n_allele ? 
strlen(line->d.allele[0]) : 0; if ( inf->vptr_free ) { free(inf->vptr - inf->vptr_off); @@ -376,6 +391,10 @@ static void init_remove_annots(args_t *args) } else if ( str.l ) { + int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s); + if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s); + if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s); + if ( !args->keep_sites ) { if ( str.s[0]=='#' && str.s[1]=='#' ) @@ -443,6 +462,42 @@ static void init_header_lines(args_t *args) if (bcf_hdr_sync(args->hdr) < 0) error_errno("[%s] Failed to update input header", __func__); } +static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) +{ + return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); +} +static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) +{ + char *str = *((char**)ptr); + int len = strlen(rec->d.id); + if ( len >= *mptr ) str = realloc(str, len+1); + strcpy(str, rec->d.id); + *((char**)ptr) = str; + *mptr = len+1; + return len; +} +static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) +{ + kstring_t str; + str.s = *((char**)ptr); + str.m = *mptr; + str.l = 0; + + int i; + if ( rec->d.n_flt ) + { + for (i=0; id.n_flt; i++) + { + if (i) kputc(';', &str); + kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str); + } + } + else kputc('.', &str); + + *((char**)ptr) = str.s; + *mptr = str.m; + return str.l; +} static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n"); @@ -452,24 +507,24 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." 
hts_expand(int,1,args->mtmpi,args->tmpi); args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]); - if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]); - if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; } + if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]); + if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); if ( col->replace!=REPLACE_MISSING ) { bcf_update_filter(args->hdr_out,line,NULL,0); - bcf_update_filter(args->hdr_out,line,args->tmpi,1); - return 0; + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); } // only update missing FILTER if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); if ( !line->d.n_flt ) - bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return 0; } static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - int i; + int i, ret = 0; bcf1_t *rec = (bcf1_t*) data; if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT); if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); @@ -480,9 +535,9 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void for (i=0; id.n_flt; i++) { const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]); - bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)); + if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1; } - return 0; + return ret; } hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi); for (i=0; id.n_flt; i++) @@ -491,12 +546,12 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); } 
bcf_update_filter(args->hdr_out,line,NULL,0); - bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); - return 0; + return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); } static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); + if ( col->replace==MATCH_VALUE ) return 0; // possible cases: // IN ANNOT OUT ACHIEVED_BY @@ -519,14 +574,28 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) } static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { + if ( col->replace==MATCH_VALUE ) return 0; + bcf1_t *rec = (bcf1_t*) data; - if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "." - if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id); - if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id); + + char *id; + if ( col->getter ) + { + int nret = col->getter(args,rec,col,&col->ptr,&col->mptr); + id = (char*) col->ptr; + if ( nret<=0 || (nret==1 && *id=='.') ) return 0; // don't replace with "." + } + else + { + if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "." + id = rec->d.id; + } + if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id); + if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id); // running with +ID, only update missing ids if ( !line->d.id || (line->d.id[0]=='.' 
&& !line->d.id[1]) ) - return bcf_update_id(args->hdr_out,line,rec->d.id); + return bcf_update_id(args->hdr_out,line,id); return 0; } static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -537,9 +606,9 @@ static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *da als[0] = rec->d.allele[0]; int i; for (i=1; in_allele; i++) als[i] = line->d.allele[i]; - bcf_update_alleles(args->hdr_out, line, als, line->n_allele); + int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele); free(als); - return 0; + return ret; } static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -553,9 +622,9 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele); als[0] = line->d.allele[0]; for (i=1; in_allele; i++) als[i] = rec->d.allele[i]; - bcf_update_alleles(args->hdr_out, line, als, rec->n_allele); + int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele); free(als); - return 0; + return ret; } static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -629,34 +698,51 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpi2[i] = args->tmpi[ map[i] ]; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); } static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { annot_line_t *tab = (annot_line_t*) data; + // This is a bit hacky, only to reuse existing code with minimal changes: + // -c =TAG will now behave as -l TAG:APPEND for integers + if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( !tab ) { - if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && 
col->merge_method!=MM_APPEND ) - error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); + if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && + col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && + col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND_MISSING ) + error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n"); } int i,ntmpi = 0; - if ( tab ) + if ( tab ) // has data, not flushing yet { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; while ( *end ) { - int val = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ntmpi++; hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); - args->tmpi[ntmpi-1] = val; - str = end+1; + if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) + { + if ( col->merge_method==MM_APPEND_MISSING ) + args->tmpi[ntmpi-1] = bcf_int32_missing; + else + ntmpi--; + if ( str[1]==0 ) end = str+1; + str += 2; + } + else + { + args->tmpi[ntmpi-1] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + str = end+1; + } } if ( col->merge_method!=MM_FIRST ) { @@ -669,7 +755,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } else { - if ( col->merge_method==MM_APPEND ) + if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { int nori = col->mm_dbl_nused; col->mm_dbl_nused += ntmpi; @@ -689,9 +775,10 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } } col->mm_dbl_ndat++; + return 1; } } - else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) + else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { ntmpi = col->mm_dbl_nused; hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); @@ -715,8 +802,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -733,8 +819,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { @@ -765,34 +850,51 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpf2[i] = args->tmpf[ map[i] ]; } - 
bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); } static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { annot_line_t *tab = (annot_line_t*) data; + // This is a bit hacky, only to reuse existing code with minimal changes: + // -c =TAG will now behave as -l TAG:APPEND for floats + if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( !tab ) { - if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) - error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); + if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && + col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && + col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND_MISSING ) + error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n"); } int i,ntmpf = 0; if ( tab ) { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; while ( *end ) { - double val = strtod(str, &end); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ntmpf++; hts_expand(float,ntmpf,args->mtmpf,args->tmpf); - args->tmpf[ntmpf-1] = val; - str = end+1; + if ( str[0]=='.' 
&& (str[1]==0 || str[1]==',') ) + { + if ( col->merge_method==MM_APPEND_MISSING ) + bcf_float_set_missing(args->tmpf[ntmpf-1]); + else + ntmpf--; + if ( str[1]==0 ) end = str+1; + str += 2; + } + else + { + args->tmpf[ntmpf-1] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + str = end+1; + } } if ( col->merge_method!=MM_FIRST ) { @@ -801,17 +903,27 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * col->mm_dbl_nused = ntmpf; hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); for (i=0; imm_dbl[i] = args->tmpf[i]; + { + if ( bcf_float_is_missing(args->tmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i]); + else + col->mm_dbl[i] = args->tmpf[i]; + } } else { - if ( col->merge_method==MM_APPEND ) + if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { int nori = col->mm_dbl_nused; col->mm_dbl_nused += ntmpf; hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); for (i=0; imm_dbl[i+nori] = args->tmpf[i]; + { + if ( bcf_float_is_missing(args->tmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i+nori]); + else + col->mm_dbl[i+nori] = args->tmpf[i]; + } } else { @@ -825,13 +937,20 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * } } col->mm_dbl_ndat++; + return 1; } } - else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) + else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { ntmpf = col->mm_dbl_nused; hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); - for (i=0; itmpf[i] = col->mm_dbl[i]; + for (i=0; imm_dbl[i]) ) + bcf_float_set_missing(args->tmpf[i]); + else + args->tmpf[i] = col->mm_dbl[i]; + } col->mm_dbl_nused = 
col->mm_dbl_ndat = 0; } else if ( col->merge_method==MM_AVG ) @@ -851,8 +970,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); } static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -869,8 +987,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) @@ -925,10 +1042,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set } int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i); - assert( ret==0 ); + if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n", __FILE__,__LINE__,__func__); } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); } void khash_str2int_clear_free(void *_hash) { @@ -947,14 +1063,18 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; } + // This is a bit hacky, only to reuse existing code with minimal changes: + // -c =TAG will now behave as -l TAG:unique for strings + if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE; + annot_line_t *tab = (annot_line_t*) data; - + int len = 0; if ( tab ) { len = strlen(tab->cols[col->icol]); if ( !len ) return 0; - if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0; + if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1; } if ( col->merge_method!=MM_FIRST ) @@ -964,17 +1084,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( data ) { - assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); + assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE ); if ( col->merge_method==MM_UNIQUE ) { if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); - if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; + if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1; khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); } if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); kputs(tab->cols[col->icol], &col->mm_kstr); - return 0; + return 1; } if ( col->mm_kstr.l ) @@ -985,12 +1105,10 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d else return 0; - if ( !data ) // flush the line - { - if ( col->merge_method==MM_UNIQUE ) - khash_str2int_clear_free(col->mm_str_hash); - col->mm_kstr.l = 0; - } + // flush the line + if ( col->merge_method==MM_UNIQUE ) + khash_str2int_clear_free(col->mm_str_hash); + col->mm_kstr.l = 0; } else { @@ -1002,14 +1120,19 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } - 
bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); } static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps); - if ( ntmps < 0 ) return 0; // nothing to add + + if ( col->getter ) + col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps); + else + { + int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps); + if ( ntmps < 0 ) return 0; // nothing to add + } if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); @@ -1020,8 +1143,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); } static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) { @@ -1691,7 +1813,6 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, } } return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1); - } static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -1773,17 +1894,12 @@ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) // tab annotation file, expecting that all samples are present: sample map not needed if ( !src ) return 0; - int nmatch = 0, order_ok = 1; + int nmatch = 0; for (i=0; isamples[i]); - if ( id!=-1 ) - { - nmatch++; - if ( i!=id ) order_ok = 0; - } + if ( id!=-1 ) nmatch++; } - if ( 
bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed if ( !nmatch ) return -1; // No matching samples found in the source and the destination file args->nsample_map = bcf_hdr_nsamples(dst); @@ -1902,11 +2018,45 @@ static void init_columns(args_t *args) int need_sample_map = 0; int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr); + kstring_t tmp = {0,0,0}; + if ( args->columns_is_file ) + { + int i,n; + char **str = hts_readlist(args->columns, args->columns_is_file, &n); + if ( !str ) error("Could not parse %s\n", args->columns); + for (i=0; imerge_method_str.l ) kputc(',',&args->merge_method_str); + kputs(str[i],&args->merge_method_str); + kputc(':',&args->merge_method_str); + kputs(ptr,&args->merge_method_str); + } + } + if ( tmp.l ) kputc(',',&tmp); + kputs(str[i],&tmp); + free(str[i]); + } + free(str); + free(args->columns); + args->columns = tmp.s; + tmp.l = tmp.m = 0; + tmp.s = NULL; + } + void *skip_fmt = NULL, *skip_info = NULL; if ( args->tgts_is_vcf ) args->columns = columns_complement(args->columns, &skip_info, &skip_fmt); - kstring_t str = {0,0,0}, tmp = {0,0,0}; + kstring_t str = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; int icol = -1, has_fmt_str = 0; @@ -1931,6 +2081,7 @@ static void init_columns(args_t *args) { args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->setter = vcf_setter_ref; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); @@ -1943,28 +2094,54 @@ static void init_columns(args_t *args) { args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->setter = vcf_setter_alt; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); } else 
args->alt_idx = icol; } - else if ( !strcasecmp("ID",str.s) ) + else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) ) { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( str.s[0]=='~' ) replace = MATCH_VALUE; + if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); + if ( replace==MATCH_VALUE ) args->match_id = icol; + } + else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column + { + if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s); + if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = vcf_setter_id; + col->getter = vcf_getter_info_str2str; + str.s[2] = 0; + col->hdr_key_dst = strdup(str.s); + col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? 
strdup(str.s+4) : strdup(str.s+4+5); + int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src); + if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) + error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname); + if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR ) + error("Only Type=String tags can be used to annotate the ID column\n"); } else if ( !strcasecmp("FILTER",str.s) ) { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter; @@ -1979,7 +2156,7 @@ static void init_columns(args_t *args) bcf_hrec_t *hrec = tgts_hdr->hrec[j]; if ( hrec->type!=BCF_HL_FLT ) continue; int k = bcf_hrec_find_key(hrec,"ID"); - assert( k>=0 ); // this should always be true for valid VCFs + if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__); tmp.l = 0; bcf_hrec_format(hrec, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); @@ -1994,6 +2171,7 @@ static void init_columns(args_t *args) if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = icol; col->replace = replace; col->setter = args->tgts_is_vcf ? 
vcf_setter_qual : setter_qual; @@ -2003,7 +2181,7 @@ static void init_columns(args_t *args) else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); + if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n"); bcf_hdr_t *tgts_hdr = args->files->readers[1].header; int j; for (j=0; jnhrec; j++) @@ -2021,6 +2199,7 @@ static void init_columns(args_t *args) int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = -1; col->replace = replace; col->hdr_key_src = strdup(hrec->vals[k]); @@ -2056,11 +2235,16 @@ static void init_columns(args_t *args) int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); col->icol = -1; col->replace = replace; col->hdr_key_src = strdup(hrec->vals[k]); col->hdr_key_dst = strdup(hrec->vals[k]); - if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt; + if ( !strcasecmp("GT",col->hdr_key_src) ) + { + if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n"); + col->setter = vcf_setter_format_gt; + } else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -2099,9 +2283,10 @@ static void init_columns(args_t *args) } int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) - error("The tag \"%s\" is not defined in %s\n", str.s, 
args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); if ( !args->tgts_is_vcf ) { col->icol = icol; @@ -2112,7 +2297,11 @@ static void init_columns(args_t *args) col->replace = replace; col->hdr_key_src = strdup(key_src); col->hdr_key_dst = strdup(key_dst); - if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt; + if ( !strcasecmp("GT",key_src) ) + { + if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n"); + col->setter = vcf_setter_format_gt; + } else switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) ) { @@ -2131,13 +2320,20 @@ static void init_columns(args_t *args) else { if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); - int explicit_info = 0; + if ( replace==SET_OR_APPEND ) + { + if ( args->tgts_is_vcf ) + error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n" + " (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n"); + fprintf(bcftools_stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n"); + } + int explicit_src_info = 0; + int explicit_dst_info = 0; char *key_dst; if ( !strncasecmp("INFO/",str.s,5) ) { key_dst = str.s + 5; - explicit_info = 1; + explicit_dst_info = 1; } else key_dst = str.s; @@ -2149,7 +2345,7 @@ static void init_columns(args_t *args) if ( !strncasecmp("INFO/",key_src,5) ) { key_src += 5; - explicit_info = 1; + explicit_src_info = 1; } else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) ) { 
@@ -2159,38 +2355,65 @@ static void init_columns(args_t *args) } else key_src = key_dst; + + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->hdr_key_src = strdup(key_src); + col->hdr_key_dst = strdup(key_dst); + int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ) { if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); - if ( !hrec ) + if ( !strcasecmp("ID",key_src) && !explicit_src_info ) { - if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) - error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); - fprintf(bcftools_stderr,"[%s] %d\n",key_src,explicit_info); - error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); + // transferring ID column into a new INFO tag + tmp.l = 0; + ksprintf(&tmp,"##INFO=",key_dst); + } + else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) + { + // transferring FILTER column into a new INFO tag + tmp.l = 0; + ksprintf(&tmp,"##INFO=",key_dst); + } + else + { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); + if ( !hrec ) + { + if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) + error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); + char *ptr = strchr(key_src,'='); + if ( ptr ) + { + *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '='; + error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); + } + error("The tag \"%s\" is not defined in %s, was 
the -h option provided?\n", key_src,args->files->readers[1].fname); + } + tmp.l = 0; + bcf_hrec_format_rename(hrec, key_dst, &tmp); } - tmp.l = 0; - bcf_hrec_format_rename(hrec, key_dst, &tmp); bcf_hdr_append(args->hdr_out, tmp.s); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update header", __func__); hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } - - args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); - annot_col_t *col = &args->cols[args->ncols-1]; - col->icol = icol; - col->replace = replace; - col->hdr_key_src = strdup(key_src); - col->hdr_key_dst = strdup(key_dst); - col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); + if ( args->tgts_is_vcf ) + { + if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str; + else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str; + } + col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break; @@ -2199,6 +2422,18 @@ static void init_columns(args_t *args) case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id)); } + if ( replace==SET_OR_APPEND ) // change to Number=. 
+ { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL); + if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst); + hrec = bcf_hrec_dup(hrec); + int j = bcf_hrec_find_key(hrec, "Number"); + if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst); + free(hrec->vals[j]); + hrec->vals[j] = strdup("."); + bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst); + bcf_hdr_add_hrec(args->hdr_out, hrec); + } } if ( !*se ) break; ss = ++se; @@ -2234,10 +2469,10 @@ static void init_merge_method(args_t *args) args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0; memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr)); } - if ( !args->merge_method_str ) return; + if ( !args->merge_method_str.l ) return; if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n"); - if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n"); - char *sb = args->merge_method_str; + if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n"); + char *sb = args->merge_method_str.s; while ( *sb ) { char *se = sb; @@ -2248,21 +2483,27 @@ static void init_merge_method(args_t *args) char *mm_type_str = args->tmpks.s + args->tmpks.l; while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--; if ( *mm_type_str!=':' ) - error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str); + error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s); *mm_type_str = 0; mm_type_str++; int mm_type = MM_FIRST; if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; + else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST; else if ( !strcasecmp("append",mm_type_str) ) mm_type = 
MM_APPEND; + else if ( !strcasecmp("append-missing",mm_type_str) ) + { + mm_type = MM_APPEND_MISSING; + if ( args->ref_idx!=-1 ) args->has_append_mode = 1; + } else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM; else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX; - else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str); + else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str); for (i=0; incols; i++) { if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; - if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) + if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR ) error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); args->cols[i].merge_method = mm_type; break; @@ -2270,6 +2511,20 @@ static void init_merge_method(args_t *args) if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); sb = *se ? 
se + 1 : se; } + if ( args->has_append_mode ) + { + // create a missing line to insert missing values when VCF ALT finds no match in the annotation file + args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing)); + int ncol = 0; + for (i=0; incols; i++) + if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1; + if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1; + args->aline_missing->mcols = ncol; + args->aline_missing->ncols = ncol; + args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*)); + for (i=0; ialine_missing->cols[i] = strdup("."); + } } static void rename_chrs(args_t *args, char *fname) @@ -2301,6 +2556,42 @@ static void rename_chrs(args_t *args, char *fname) free(map); } +static void rename_annots(args_t *args, char *fname) +{ + int n, i; + char **map = hts_readlist(fname, 1, &n); + if ( !map ) error("Could not read: %s\n", fname); + for (i=0; ihdr_out, BCF_DT_ID, sb); + if ( id<0 ) continue; + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL); + if ( !hrec ) continue; // the sequence not present + int j = bcf_hrec_find_key(hrec, "ID"); + assert( j>=0 ); + free(hrec->vals[j]); + ss++; + while ( *ss && isspace(*ss) ) ss++; + char *se = ss; + while ( *se && !isspace(*se) ) se++; + *se = 0; + hrec->vals[j] = strdup(ss); + args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + } + for (i=0; ihdr = args->files->readers[0].header; @@ -2313,6 +2604,7 @@ static void init_data(args_t *args) // reading annots from a VCF if ( !bcf_sr_add_reader(args->files, args->targets_fname) ) error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum)); + args->tgts_hdr = args->files->readers[1].header; } if ( args->columns ) init_columns(args); if ( args->targets_fname && !args->tgts_is_vcf ) @@ -2320,8 +2612,8 @@ static void init_data(args_t *args) if ( !args->columns ) error("The -c option not given\n"); if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n"); if ( 
args->beg_idx==-1 ) error("The -c POS option not given\n"); - if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); - if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) ) + if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); + if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) ) { args->end_idx = -args->beg_idx - 1; args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx); @@ -2365,8 +2657,9 @@ static void init_data(args_t *args) if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); + if ( args->rename_annots ) rename_annots(args, args->rename_annots); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); @@ -2388,8 +2681,15 @@ static void destroy_data(args_t *args) free(args->cols[i].mm_kstr.s); if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash); free(args->cols[i].mm_dbl); + free(args->cols[i].ptr); } free(args->cols); + if ( args->aline_missing ) + { + for (i=0; ialine_missing->ncols; i++) free(args->aline_missing->cols[i]); + free(args->aline_missing->cols); + free(args->aline_missing); + } for (i=0; imalines; i++) { free(args->alines[i].cols); @@ -2397,6 +2697,7 @@ static void destroy_data(args_t *args) free(args->alines[i].line.s); } free(args->alines); + free(args->srt_alines); if ( args->tgt_idx ) { regidx_destroy(args->tgt_idx); @@ -2422,6 +2723,7 @@ static void destroy_data(args_t *args) 
filter_destroy(args->filter); if (args->out_fh) hts_close(args->out_fh); free(args->sample_map); + free(args->merge_method_str.s); } static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) @@ -2485,7 +2787,6 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } else i++; } - if ( args->ref_idx==-1 && args->nalines ) return; while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) ) @@ -2506,6 +2807,36 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } } +// search string in semicolon separated strings (xx vs aa;bb) +static int str_match(char *needle, char *haystack) +{ + int len = strlen(needle); + char *ptr = haystack; + while ( *ptr && (ptr=strstr(ptr,needle)) ) + { + if ( ptr[len]!=0 && ptr[len]!=';' ) ptr++; // a prefix, not a match + else if ( ptr==haystack || ptr[-1]==';' ) return 1; // a match + ptr++; // a suffix, not a match + } + return 0; +} +// search common string in semicolon separated strings (xx;yy;zz vs aa;bb) +static int strstr_match(char *a, char *b) +{ + char *beg = a; + while ( *beg ) + { + char *end = beg; + while ( *end && *end!=';' ) end++; + char tmp = *end; + if ( *end==';' ) *end = 0; + int ret = str_match(beg,b); + *end = tmp; + if ( ret || !*end ) return ret; + beg = end + 1; + } + return 0; +} static void annotate(args_t *args, bcf1_t *line) { int i, j; @@ -2513,9 +2844,9 @@ static void annotate(args_t *args, bcf1_t *line) args->rm[i].handler(args, line, &args->rm[i]); int has_overlap = 0; - if ( args->tgt_idx ) { + for (j=0; jncols; j++) args->cols[j].done = 0; if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { while ( regitr_overlap(args->tgt_itr) ) @@ -2526,49 +2857,145 @@ static void annotate(args_t *args, bcf1_t *line) tmp->end = args->tgt_itr->end; parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); for (j=0; jncols; j++) - if ( 
args->cols[j].setter(args,line,&args->cols[j],tmp) ) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],tmp); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } } has_overlap = 1; } for (j=0; jncols; j++) - if ( args->cols[j].merge_method != MM_FIRST ) - args->cols[j].setter(args,line,&args->cols[j],NULL); + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } } else if ( args->tgts ) { - // Buffer annotation lines. When multiple ALT alleles are present in the - // annotation file, at least one must match one of the VCF alleles. - int len = 0; - bcf_get_variant_types(line); - for (i=1; in_allele; i++) - if ( len > line->d.var[i].n ) len = line->d.var[i].n; - int end_pos = len<0 ? line->pos - len : line->pos; + // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one + // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the + // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found + // for an ALT, missing value is appended instead. 
+ int end_pos = line->pos + line->rlen - 1; buffer_annot_lines(args, line, line->pos, end_pos); + + args->nsrt_alines = 0; + hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines); + if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) + error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + + // Find matching lines for (i=0; inalines; i++) { if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue; - if ( args->ref_idx != -1 ) + if ( args->ref_idx != -1 ) // REF+ALT matching requested { - if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible + if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible for (j=1; jalines[i].nals; j++) { - if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "." - if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break; + int ialt; + if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "." 
+ ialt = 0; + else + { + ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]); + if ( ialt < 0 ) continue; + ialt++; + } + if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; + args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; + has_overlap = 1; + break; } - if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT } - break; + else // overlap, REF+ALT matching not requested + { + args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i; + has_overlap = 1; + } } - - if ( inalines ) + // Sort lines if needed + if ( args->has_append_mode ) + { + // insertion sort by VCF ALT index (top bits) and alines index (low bits) + uint32_t tmp; + for (i=1; insrt_alines; i++) + for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--) + tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp; + } + // Annotate + for (j=0; jncols; j++) args->cols[j].done = 0; + int ialt_exp = 1; + for (i=0; insrt_alines; i++) { - // there is a matching line + int ialt = args->srt_alines[i] >> 16; + int ilin = args->srt_alines[i] & 0xffff; + if ( args->has_append_mode ) + { + if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position + if ( ialt_exp < ialt ) + { + // REF+ALT matching requested, append-missing mode: insert "." 
if no annotation line was found for the ALT + while ( ialt_exp++ < ialt ) + { + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + } + } + } for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + ialt_exp = ialt + 1; + } + if ( args->nsrt_alines ) + { + // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one + // record was found. Otherwise leave the row will be left without annotation. 
+ if ( args->has_append_mode && ialt_exp < line->n_allele ) + { + while ( ialt_exp++ < line->n_allele ) + { + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + } + } + // Flush + for (j=0; jncols; j++) + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); + if ( ret < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } } - has_overlap = inalines ? 1 : 0; } else if ( args->files->nreaders == 2 ) { @@ -2613,30 +3040,32 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools annotate [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(bcftools_stderr, " --collapse matching records by , see man page for details [some]\n"); - fprintf(bcftools_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(bcftools_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(bcftools_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); - fprintf(bcftools_stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(bcftools_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); - fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(bcftools_stderr, " -x, --remove list of annotations 
(e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); - fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(bcftools_stderr, " --collapse STR matching records by , see man page for details [some]\n"); + fprintf(bcftools_stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); + fprintf(bcftools_stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n"); + fprintf(bcftools_stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n"); + fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n"); + fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); + fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); + fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: 
uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n"); + fprintf(bcftools_stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); + fprintf(bcftools_stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n"); + fprintf(bcftools_stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); + fprintf(bcftools_stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); + fprintf(bcftools_stderr, " --threads INT number of extra output compression threads [0]\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfannotate(int argc, char *argv[]) @@ -2651,6 +3080,7 @@ int main_vcfannotate(int argc, char *argv[]) args->record_cmd_line = 1; args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; args->set_ids_replace = 1; + args->match_id = -1; int regions_is_file = 0, collapse = 0; static struct option loptions[] = @@ -2669,7 +3099,9 @@ int main_vcfannotate(int argc, char *argv[]) {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"remove",required_argument,NULL,'x'}, + {"columns-file",required_argument,NULL,'C'}, {"columns",required_argument,NULL,'c'}, + {"rename-annots",required_argument,NULL,11}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, {"samples",required_argument,NULL,'s'}, @@ -2679,7 +3111,7 @@ int main_vcfannotate(int argc, char *argv[]) {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { case 'f': args->force = 1; break; @@ -2690,11 +3122,15 @@ int main_vcfannotate(int argc, char *argv[]) else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } else args->mark_sites = optarg; break; - case 'l': args->merge_method_str = optarg; break; + case 'l': + if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str); + kputs(optarg,&args->merge_method_str); + break; case 'I': args->set_ids_fmt = optarg; break; case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 'c': args->columns = strdup(optarg); break; + case 'C': args->columns = strdup(optarg); 
args->columns_is_file = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -2705,8 +3141,12 @@ int main_vcfannotate(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); }; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'x': args->remove_annots = optarg; break; case 'a': args->targets_fname = optarg; break; case 'r': args->regions_list = optarg; break; @@ -2726,6 +3166,7 @@ int main_vcfannotate(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; + case 11 : args->rename_annots = optarg; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c index ffdfd40..71916bb 100644 --- a/bcftools/vcfbuf.c +++ b/bcftools/vcfbuf.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2019 Genome Research Ltd. + Copyright (c) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -24,16 +24,19 @@ */ +#include +#include #include #include +#include #include "bcftools.h" #include "vcfbuf.h" #include "rbuf.h" typedef struct { - double max; - int rand_missing, skip_filter; + double max[VCFBUF_LD_N]; + int rand_missing, filter1; } ld_t; @@ -41,13 +44,16 @@ typedef struct { bcf1_t *rec; double af; - int af_set:1, idx:31; + int af_set:1, filter:1, idx:30; } vcfrec_t; +#define PRUNE_MODE_MAX_AF 1 +#define PRUNE_MODE_1ST 2 +#define PRUNE_MODE_RAND 3 typedef struct { - int max_sites, mvrec, mac, mfarr; + int max_sites, mvrec, mac, mfarr, mode; int *ac, *idx; float *farr; char *af_tag; @@ -85,6 +91,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) buf->hdr = hdr; buf->win = win; buf->overlap.rid = -1; + int i; + for (i=0; ild.max[i] = HUGE_VAL; rbuf_init(&buf->rbuf, 0); return buf; } @@ -104,13 +112,30 @@ void vcfbuf_destroy(vcfbuf_t *buf) void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) { - if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; } - if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; } - if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; } - if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } + if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; } + if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; } + if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; } + if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } + if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } + + if ( key==VCFBUF_NSITES ) + { + buf->prune.max_sites = *((int*)value); + if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; + return; + } if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } if 
( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } + + if ( key==VCFBUF_NSITES_MODE ) + { + char *mode = *((char**)value); + if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF; + else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; + else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND; + else error("The mode \"%s\" is not recognised\n",mode); + } } int vcfbuf_nsites(vcfbuf_t *buf) @@ -118,10 +143,8 @@ int vcfbuf_nsites(vcfbuf_t *buf) return buf->rbuf.n; } -bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap) +bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) { - if ( !swap ) error("todo: swap=%d\n", swap); - rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf); int i = rbuf_append(&buf->rbuf); @@ -130,6 +153,8 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap) bcf1_t *ret = buf->vcf[i].rec; buf->vcf[i].rec = rec; buf->vcf[i].af_set = 0; + buf->vcf[i].filter = buf->ld.filter1; + buf->ld.filter1 = 0; return ret; } @@ -170,6 +195,26 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) { int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1; + int nprune = nbuf - buf->prune.max_sites; + int i,k,irec = 0; + if ( buf->prune.mode==PRUNE_MODE_1ST ) + { + int eoff = flush_all ? 1 : 2; + for (i=0; irbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf); + return; + } + if ( buf->prune.mode==PRUNE_MODE_RAND ) + { + int eoff = flush_all ? 
0 : 1; + for (i=0; irbuf.n - eoff) * hts_drand48(); + rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf); + } + return; + } + if ( nbuf > buf->prune.mvrec ) { buf->prune.idx = (int*) realloc(buf->prune.idx, nbuf*sizeof(int)); @@ -178,7 +223,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) } // set allele frequency and prepare buffer for sorting - int i,k,irec = 0; for (i=-1; rbuf_next(&buf->rbuf,&i) && irecvcf[i].rec; @@ -211,7 +255,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) // sort the rbuf indexes to be pruned descendently so that j-th rbuf index // is removed before i-th index if iprune.max_sites; for (i=0; iprune.idx[i] = buf->prune.vrec[i]->idx; @@ -333,10 +376,21 @@ static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples) } /* - For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/ + The `ld` is set to D approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/ D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb)) + + and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage + disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265 + + \hat{D} = 1/[n*(n+1)]*[ + (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9) + -(n2/2 + n3 + n5/4 + n6/2)*(n4/2 + n5/4 + n7 + n8/2) + ] + where n1,n2,..n9 are counts of RR/RR,RR/RA,..,AA/AA genotypes. 
+ + Returns 0 on success, -1 if the values could not be determined (missing genotypes) */ -static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) +static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *ld) { if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample); assert( arec->n_sample ); @@ -365,21 +419,24 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample); } - // Calculate correlation + // Calculate r2, lf, hd + double nhd[] = {0,0,0,0,0,0,0,0,0}; double ab = 0, aa = 0, bb = 0, a = 0, b = 0; - int nab = 0, na = 0, nb = 0, ndiff = 0; + int nab = 0, ndiff = 0; + int an_tot = 0, bn_tot = 0; for (i=0; in_sample; i++) { int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size); int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size); - int adsg = 0, bdsg = 0, an = 0, bn = 0; + int adsg = 0, bdsg = 0; // dosages (0,1,2) at sites (a,b) + int an = 0, bn = 0; // number of alleles at sites (a,b) for (j=0; jn; j++) { if ( aptr[j]==bcf_int8_vector_end ) break; if ( aptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( rand()/RAND_MAX >= aaf ) adsg += 1; + if ( hts_drand48() >= aaf ) adsg += 1; } else if ( bcf_gt_allele(aptr[j]) ) adsg += 1; an++; @@ -390,89 +447,112 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) if ( bptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( rand()/RAND_MAX >= baf ) bdsg += 1; + if ( hts_drand48() >= baf ) bdsg += 1; } else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1; bn++; } - if ( an ) + if ( an && bn ) { + an_tot += an; aa += adsg*adsg; a += adsg; - na++; - } - if ( bn ) - { + + bn_tot += bn; bb += bdsg*bdsg; b += bdsg; - nb++; - } - if ( an && bn ) - { + if ( adsg!=bdsg ) ndiff++; ab += adsg*bdsg; nab++; } + if ( an==2 && bn==2 ) // for now only diploid genotypes + { + assert( adsg<=2 && bdsg<=2 ); + nhd[ bdsg*3 + adsg 
]++; + } } - if ( !nab ) return -1; + if ( !nab ) return -1; // no data in common for the two sites + double pa = a/an_tot; + double pb = b/bn_tot; double cor; if ( !ndiff ) cor = 1; else { - // Don't know how to deal with zero variance. Since this the purpose is filtering, - // it is not enough to say the value is undefined. Therefore an artificial noise is - // added to make the denominator non-zero. - if ( aa == a*a/na || bb == b*b/nb ) + if ( aa == a*a/nab || bb == b*b/nab ) // zero variance, add small noise { - aa += 3*3; - bb += 3*3; - ab += 3*3; - a += 3; - b += 3; - na++; - nb++; + aa += 1e-4; + bb += 1e-4; + ab += 1e-4; + a += 1e-2; + b += 1e-2; nab++; } - cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb); + cor = (ab - a*b/nab) / sqrt(aa - a*a/nab) / sqrt(bb - b*b/nab); } - return cor*cor; + + ld->val[VCFBUF_LD_IDX_R2] = cor * cor; + + // Lewontin's normalization of D. Also we cap at 1 as the calculation + // can result in values bigger than 1 for high AFs. + ld->val[VCFBUF_LD_IDX_LD] = cor * sqrt(pa*(1-pa)*pb*(1-pb)); + double norm; + if ( ld->val[VCFBUF_LD_IDX_LD] < 0 ) + norm = -pa*pb > -(1-pa)*(1-pb) ? -pa*pb : -(1-pa)*(1-pb); + else + norm = pa*(1-pb) > (1-pa)*pb ? pa*(1-pb) : (1-pa)*pb; + if ( norm ) + ld->val[VCFBUF_LD_IDX_LD] = fabs(norm) > fabs(ld->val[VCFBUF_LD_IDX_LD]) ? ld->val[VCFBUF_LD_IDX_LD]/norm : 1; + if ( !ld->val[VCFBUF_LD_IDX_LD] ) + ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output + + ld->val[VCFBUF_LD_IDX_HD] = + (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) + - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. 
+ nhd[6] + nhd[7]/2.); + ld->val[VCFBUF_LD_IDX_HD] /= nab; + ld->val[VCFBUF_LD_IDX_HD] /= nab+1; + + return 0; } -bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld) +int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld) { - *ld = -1; - if ( !buf->rbuf.n ) return NULL; + int ret = -1; + if ( !buf->rbuf.n ) return ret; - int i = buf->rbuf.f; + int j, i = buf->rbuf.f; // Relying on vcfbuf being properly flushed - all sites in the buffer // must come from the same chromosome - if ( buf->vcf[i].rec->rid != rec->rid ) return NULL; + if ( buf->vcf[i].rec->rid != rec->rid ) return ret; + + vcfbuf_ld_t tmp; + for (j=0; jval[j] = -HUGE_VAL; + ld->rec[j] = NULL; + } - int imax = 0; - double max = 0; for (i=-1; rbuf_next(&buf->rbuf,&i); ) { - if ( buf->ld.skip_filter ) - { - if ( buf->vcf[i].rec->d.n_flt > 1 ) continue; // multiple filters are set - if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue; // not PASS - } - double val = _calc_ld(buf, buf->vcf[i].rec, rec); - if ( buf->ld.max && buf->ld.max < val ) - { - *ld = val; - return buf->vcf[i].rec; - } - if ( val > max ) + if ( buf->vcf[i].filter ) continue; + if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes + + int done = 0; + for (j=0; jval[j] < tmp.val[j] ) + { + ld->val[j] = tmp.val[j]; + ld->rec[j] = buf->vcf[i].rec; + } + if ( buf->ld.max[j] < tmp.val[j] ) done = 1; + ret = 0; } + if ( done ) return ret; } - *ld = max; - return buf->vcf[imax].rec; + return ret; } diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c index d1dcf99..50df73d 100644 --- a/bcftools/vcfbuf.c.pysam.c +++ b/bcftools/vcfbuf.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2016-2019 Genome Research Ltd. + Copyright (c) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -26,16 +26,19 @@ */ +#include +#include #include #include +#include #include "bcftools.h" #include "vcfbuf.h" #include "rbuf.h" typedef struct { - double max; - int rand_missing, skip_filter; + double max[VCFBUF_LD_N]; + int rand_missing, filter1; } ld_t; @@ -43,13 +46,16 @@ typedef struct { bcf1_t *rec; double af; - int af_set:1, idx:31; + int af_set:1, filter:1, idx:30; } vcfrec_t; +#define PRUNE_MODE_MAX_AF 1 +#define PRUNE_MODE_1ST 2 +#define PRUNE_MODE_RAND 3 typedef struct { - int max_sites, mvrec, mac, mfarr; + int max_sites, mvrec, mac, mfarr, mode; int *ac, *idx; float *farr; char *af_tag; @@ -87,6 +93,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) buf->hdr = hdr; buf->win = win; buf->overlap.rid = -1; + int i; + for (i=0; ild.max[i] = HUGE_VAL; rbuf_init(&buf->rbuf, 0); return buf; } @@ -106,13 +114,30 @@ void vcfbuf_destroy(vcfbuf_t *buf) void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) { - if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; } - if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; } - if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; } - if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } + if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; } + if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; } + if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; } + if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } + if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } + + if ( key==VCFBUF_NSITES ) + { + buf->prune.max_sites = *((int*)value); + if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; + return; + } if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } if 
( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } + + if ( key==VCFBUF_NSITES_MODE ) + { + char *mode = *((char**)value); + if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF; + else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; + else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND; + else error("The mode \"%s\" is not recognised\n",mode); + } } int vcfbuf_nsites(vcfbuf_t *buf) @@ -120,10 +145,8 @@ int vcfbuf_nsites(vcfbuf_t *buf) return buf->rbuf.n; } -bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap) +bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) { - if ( !swap ) error("todo: swap=%d\n", swap); - rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf); int i = rbuf_append(&buf->rbuf); @@ -132,6 +155,8 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap) bcf1_t *ret = buf->vcf[i].rec; buf->vcf[i].rec = rec; buf->vcf[i].af_set = 0; + buf->vcf[i].filter = buf->ld.filter1; + buf->ld.filter1 = 0; return ret; } @@ -172,6 +197,26 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) { int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1; + int nprune = nbuf - buf->prune.max_sites; + int i,k,irec = 0; + if ( buf->prune.mode==PRUNE_MODE_1ST ) + { + int eoff = flush_all ? 1 : 2; + for (i=0; irbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf); + return; + } + if ( buf->prune.mode==PRUNE_MODE_RAND ) + { + int eoff = flush_all ? 
0 : 1; + for (i=0; irbuf.n - eoff) * hts_drand48(); + rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf); + } + return; + } + if ( nbuf > buf->prune.mvrec ) { buf->prune.idx = (int*) realloc(buf->prune.idx, nbuf*sizeof(int)); @@ -180,7 +225,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) } // set allele frequency and prepare buffer for sorting - int i,k,irec = 0; for (i=-1; rbuf_next(&buf->rbuf,&i) && irecvcf[i].rec; @@ -213,7 +257,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) // sort the rbuf indexes to be pruned descendently so that j-th rbuf index // is removed before i-th index if iprune.max_sites; for (i=0; iprune.idx[i] = buf->prune.vrec[i]->idx; @@ -335,10 +378,21 @@ static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples) } /* - For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/ + The `ld` is set to D approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/ D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb)) + + and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage + disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265 + + \hat{D} = 1/[n*(n+1)]*[ + (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9) + -(n2/2 + n3 + n5/4 + n6/2)*(n4/2 + n5/4 + n7 + n8/2) + ] + where n1,n2,..n9 are counts of RR/RR,RR/RA,..,AA/AA genotypes. 
+ + Returns 0 on success, -1 if the values could not be determined (missing genotypes) */ -static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) +static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *ld) { if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample); assert( arec->n_sample ); @@ -367,21 +421,24 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample); } - // Calculate correlation + // Calculate r2, lf, hd + double nhd[] = {0,0,0,0,0,0,0,0,0}; double ab = 0, aa = 0, bb = 0, a = 0, b = 0; - int nab = 0, na = 0, nb = 0, ndiff = 0; + int nab = 0, ndiff = 0; + int an_tot = 0, bn_tot = 0; for (i=0; in_sample; i++) { int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size); int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size); - int adsg = 0, bdsg = 0, an = 0, bn = 0; + int adsg = 0, bdsg = 0; // dosages (0,1,2) at sites (a,b) + int an = 0, bn = 0; // number of alleles at sites (a,b) for (j=0; jn; j++) { if ( aptr[j]==bcf_int8_vector_end ) break; if ( aptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( rand()/RAND_MAX >= aaf ) adsg += 1; + if ( hts_drand48() >= aaf ) adsg += 1; } else if ( bcf_gt_allele(aptr[j]) ) adsg += 1; an++; @@ -392,89 +449,112 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) if ( bptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( rand()/RAND_MAX >= baf ) bdsg += 1; + if ( hts_drand48() >= baf ) bdsg += 1; } else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1; bn++; } - if ( an ) + if ( an && bn ) { + an_tot += an; aa += adsg*adsg; a += adsg; - na++; - } - if ( bn ) - { + + bn_tot += bn; bb += bdsg*bdsg; b += bdsg; - nb++; - } - if ( an && bn ) - { + if ( adsg!=bdsg ) ndiff++; ab += adsg*bdsg; nab++; } + if ( an==2 && bn==2 ) // for now only diploid genotypes + { + assert( adsg<=2 && bdsg<=2 ); + nhd[ bdsg*3 + adsg 
]++; + } } - if ( !nab ) return -1; + if ( !nab ) return -1; // no data in common for the two sites + double pa = a/an_tot; + double pb = b/bn_tot; double cor; if ( !ndiff ) cor = 1; else { - // Don't know how to deal with zero variance. Since this the purpose is filtering, - // it is not enough to say the value is undefined. Therefore an artificial noise is - // added to make the denominator non-zero. - if ( aa == a*a/na || bb == b*b/nb ) + if ( aa == a*a/nab || bb == b*b/nab ) // zero variance, add small noise { - aa += 3*3; - bb += 3*3; - ab += 3*3; - a += 3; - b += 3; - na++; - nb++; + aa += 1e-4; + bb += 1e-4; + ab += 1e-4; + a += 1e-2; + b += 1e-2; nab++; } - cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb); + cor = (ab - a*b/nab) / sqrt(aa - a*a/nab) / sqrt(bb - b*b/nab); } - return cor*cor; + + ld->val[VCFBUF_LD_IDX_R2] = cor * cor; + + // Lewontin's normalization of D. Also we cap at 1 as the calculation + // can result in values bigger than 1 for high AFs. + ld->val[VCFBUF_LD_IDX_LD] = cor * sqrt(pa*(1-pa)*pb*(1-pb)); + double norm; + if ( ld->val[VCFBUF_LD_IDX_LD] < 0 ) + norm = -pa*pb > -(1-pa)*(1-pb) ? -pa*pb : -(1-pa)*(1-pb); + else + norm = pa*(1-pb) > (1-pa)*pb ? pa*(1-pb) : (1-pa)*pb; + if ( norm ) + ld->val[VCFBUF_LD_IDX_LD] = fabs(norm) > fabs(ld->val[VCFBUF_LD_IDX_LD]) ? ld->val[VCFBUF_LD_IDX_LD]/norm : 1; + if ( !ld->val[VCFBUF_LD_IDX_LD] ) + ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output + + ld->val[VCFBUF_LD_IDX_HD] = + (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) + - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. 
+ nhd[6] + nhd[7]/2.); + ld->val[VCFBUF_LD_IDX_HD] /= nab; + ld->val[VCFBUF_LD_IDX_HD] /= nab+1; + + return 0; } -bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld) +int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld) { - *ld = -1; - if ( !buf->rbuf.n ) return NULL; + int ret = -1; + if ( !buf->rbuf.n ) return ret; - int i = buf->rbuf.f; + int j, i = buf->rbuf.f; // Relying on vcfbuf being properly flushed - all sites in the buffer // must come from the same chromosome - if ( buf->vcf[i].rec->rid != rec->rid ) return NULL; + if ( buf->vcf[i].rec->rid != rec->rid ) return ret; + + vcfbuf_ld_t tmp; + for (j=0; jval[j] = -HUGE_VAL; + ld->rec[j] = NULL; + } - int imax = 0; - double max = 0; for (i=-1; rbuf_next(&buf->rbuf,&i); ) { - if ( buf->ld.skip_filter ) - { - if ( buf->vcf[i].rec->d.n_flt > 1 ) continue; // multiple filters are set - if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue; // not PASS - } - double val = _calc_ld(buf, buf->vcf[i].rec, rec); - if ( buf->ld.max && buf->ld.max < val ) - { - *ld = val; - return buf->vcf[i].rec; - } - if ( val > max ) + if ( buf->vcf[i].filter ) continue; + if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes + + int done = 0; + for (j=0; jval[j] < tmp.val[j] ) + { + ld->val[j] = tmp.val[j]; + ld->rec[j] = buf->vcf[i].rec; + } + if ( buf->ld.max[j] < tmp.val[j] ) done = 1; + ret = 0; } + if ( done ) return ret; } - *ld = max; - return buf->vcf[imax].rec; + return ret; } diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h index 9ede5b5..d3be6c5 100644 --- a/bcftools/vcfbuf.h +++ b/bcftools/vcfbuf.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2017-2019 Genome Research Ltd. + Copyright (c) 2017-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -38,13 +38,18 @@ typedef struct _vcfbuf_t vcfbuf_t; // Modes of operation typedef enum { - VCFBUF_LD_MAX, // vcfbuf_max_ld() stops at the first record that exceeds the threshold - VCFBUF_RAND_MISSING, // randomize rather than ignore missing genotypes - VCFBUF_SKIP_FILTER, // skip sites with FILTER diferent from "PASS" or "." - VCFBUF_NSITES, // leave at max this many sites in the window - VCFBUF_AF_TAG, // use this INFO tag with LD_NSITES VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window VCFBUF_RMDUP, // remove duplicate sites (completely) + VCFBUF_NSITES, // leave at max this many sites in the window + VCFBUF_NSITES_MODE, // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) + VCFBUF_AF_TAG, // use this INFO tag with VCFBUF_NSITES + + // LD related options + LD_RAND_MISSING, // randomize rather than ignore missing genotypes + LD_FILTER1, // exclude the next record inserted by vcfbuf_push() from LD analysis + LD_MAX_R2, // If set, vcfbuf_ld() will stop at the first record that exceeds the R2, + LD_MAX_LD, // LD, or HD threshold. When multiple are set, the OR logic is applied + LD_MAX_HD, // } vcfbuf_opt_t; @@ -61,9 +66,8 @@ void vcfbuf_destroy(vcfbuf_t *buf); /* * vcfbuf_push() - push a new site for analysis - * @swap: if set, do not create a copy, but return a substitute */ -bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap); +bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec); /* * vcfbuf_peek() - return pointer to i-th record in the buffer but do not remove it from the buffer @@ -85,10 +89,28 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all); int vcfbuf_nsites(vcfbuf_t *buf); /* - * vcfbuf_max_ld() - return a record that has maximum D or first record exceeding the threshold - * @ld: will be filled with the maximum D found + * vcfbuf_ld() - find records with maximum LD values or the values in first record that exceeds thresholds + * set by vcfbuf_set_opt(..,LD_MAX*,..) 
+ * + * Returns 0 on success or -1 if no values were filled. + * + * @val: will be filled with the values + * .. correlation coefficient r-squared + * .. Lewontin's D' (PMID: 19433632) + * .. Ragsdale's \hat{D} (doi:10.1093/molbev/msz265) + * @rec: corresponding positions or NULL if the value(s) has not been set */ -bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld); +#define VCFBUF_LD_N 3 +#define VCFBUF_LD_IDX_R2 0 +#define VCFBUF_LD_IDX_LD 1 +#define VCFBUF_LD_IDX_HD 2 +typedef struct +{ + double val[VCFBUF_LD_N]; // r2, ld, hd + bcf1_t *rec[VCFBUF_LD_N]; // record with max r2, ld, hd +} +vcfbuf_ld_t; +int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld); #endif diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index f546542..e2aab3f 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -189,6 +190,11 @@ static ploidy_predef_t ploidy_predefs[] = .ploidy = "* * * * 1\n" }, + { .alias = "2", + .about = "Treat all samples as diploid", + .ploidy = + "* * * * 2\n" + }, { .alias = NULL, .about = NULL, @@ -536,7 +542,7 @@ bcf1_t *next_line(args_t *args) bcf_unpack(rec, BCF_UN_STR); if ( !rec0 ) rec0 = rec; recN = rec; - args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); + args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec); if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break; } } @@ -611,7 +617,7 @@ static void init_data(args_t *args) // Open files for input and output, initialize structures if ( args->targets ) { - args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? 
args : NULL); + args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL); args->tgt_itr = regitr_init(args->tgt_idx); args->tgt_itr_tmp = regitr_init(args->tgt_idx); } @@ -686,7 +692,7 @@ static void init_data(args_t *args) if ( args->aux.flag & CALL_CONSTR_ALLELES ) args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -768,7 +774,20 @@ void parse_novel_rate(args_t *args, const char *str) else error("Could not parse --novel-rate %s\n", str); } -static int parse_format_flag(const char *str) +static void list_annotations(FILE *fp) +{ + fprintf(fp, + "\n" + "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n" + " INFO/PV4 .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n" + "\n" + "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n" + " FORMAT/GQ .. Phred-scaled genotype quality (Number=1,Type=Integer)\n" + " FORMAT/GP .. 
Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n" + "\n"); +} + +static int parse_output_tags(const char *str) { int flag = 0; const char *ss = str; @@ -776,8 +795,9 @@ static int parse_format_flag(const char *str) { const char *se = ss; while ( *se && *se!=',' ) se++; - if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ; - else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP; + if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss) ) flag |= CALL_FMT_GQ; + else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP; + else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4; else { fprintf(stderr,"Could not parse \"%s\"\n", str); @@ -856,41 +876,46 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools call [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "File format options:\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' 
for details\n"); - fprintf(stderr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples list of samples to include [all samples]\n"); - fprintf(stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); + fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n"); + fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Input/output options:\n"); - fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(stderr, " -F, --prior-freqs use prior allele frequencies\n"); - fprintf(stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); - fprintf(stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); - fprintf(stderr, " -V, --skip-variants skip indels/snps\n"); - fprintf(stderr, " -v, --variants-only output variant sites only\n"); + fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); +//todo? 
+// fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n"); +// fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n"); + fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); + fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); + fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); + fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); + fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); + fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); + fprintf(stderr, " -v, --variants-only Output variant sites only\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); - fprintf(stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); - fprintf(stderr, " -C, --constrain one of: alleles, trio (see manual)\n"); - fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(stderr, " -n, --novel-rate ,[...] 
likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); + fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); + fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n"); + fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); + fprintf(stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); + fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0) { switch (c) { @@ -969,7 +996,12 @@ int main_vcfcall(int argc, char *argv[]) case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; case 'G': args.aux.sample_groups = optarg; break; - case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; + case 3 : args.aux.sample_groups_tag = optarg; break; + case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n"); + case 'a': + if (optarg[0]=='?') { list_annotations(stderr); return 1; } + args.aux.output_tags |= parse_output_tags(optarg); + break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 8caf510..b5bedb9 100644 --- a/bcftools/vcfcall.c.pysam.c +++ 
b/bcftools/vcfcall.c.pysam.c @@ -2,7 +2,7 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -27,6 +27,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -191,6 +192,11 @@ static ploidy_predef_t ploidy_predefs[] = .ploidy = "* * * * 1\n" }, + { .alias = "2", + .about = "Treat all samples as diploid", + .ploidy = + "* * * * 2\n" + }, { .alias = NULL, .about = NULL, @@ -538,7 +544,7 @@ bcf1_t *next_line(args_t *args) bcf_unpack(rec, BCF_UN_STR); if ( !rec0 ) rec0 = rec; recN = rec; - args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); + args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec); if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break; } } @@ -613,7 +619,7 @@ static void init_data(args_t *args) // Open files for input and output, initialize structures if ( args->targets ) { - args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL); + args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? 
args : NULL); args->tgt_itr = regitr_init(args->tgt_idx); args->tgt_itr_tmp = regitr_init(args->tgt_idx); } @@ -688,7 +694,7 @@ static void init_data(args_t *args) if ( args->aux.flag & CALL_CONSTR_ALLELES ) args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -770,7 +776,20 @@ void parse_novel_rate(args_t *args, const char *str) else error("Could not parse --novel-rate %s\n", str); } -static int parse_format_flag(const char *str) +static void list_annotations(FILE *fp) +{ + fprintf(fp, + "\n" + "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n" + " INFO/PV4 .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n" + "\n" + "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n" + " FORMAT/GQ .. Phred-scaled genotype quality (Number=1,Type=Integer)\n" + " FORMAT/GP .. 
Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n" + "\n"); +} + +static int parse_output_tags(const char *str) { int flag = 0; const char *ss = str; @@ -778,12 +797,13 @@ static int parse_format_flag(const char *str) { const char *se = ss; while ( *se && *se!=',' ) se++; - if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ; - else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP; + if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss) ) flag |= CALL_FMT_GQ; + else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP; + else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4; else { fprintf(bcftools_stderr,"Could not parse \"%s\"\n", str); - exit(1); + bcftools_exit(1); } if ( !*se ) break; ss = se + 1; @@ -837,12 +857,12 @@ ploidy_t *init_ploidy(char *alias) fprintf(bcftools_stderr,"Run as --ploidy (e.g. --ploidy GRCh37).\n"); fprintf(bcftools_stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n"); fprintf(bcftools_stderr,"\n"); - exit(-1); + bcftools_exit(-1); } else if ( detailed ) { fprintf(bcftools_stderr,"%s", pld->ploidy); - exit(-1); + bcftools_exit(-1); } return ploidy_init_string(pld->ploidy,2); } @@ -858,41 +878,46 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools call [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "File format options:\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " --ploidy [?] 
predefined ploidy, 'list' to print available settings, append '?' for details\n"); - fprintf(bcftools_stderr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --samples list of samples to include [all samples]\n"); - fprintf(bcftools_stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); + fprintf(bcftools_stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include [all samples]\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Input/output options:\n"); - fprintf(bcftools_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(bcftools_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(bcftools_stderr, " -F, --prior-freqs use prior allele frequencies\n"); - fprintf(bcftools_stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); - fprintf(bcftools_stderr, " -g, --gvcf ,[...] 
group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(bcftools_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(bcftools_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); - fprintf(bcftools_stderr, " -V, --skip-variants skip indels/snps\n"); - fprintf(bcftools_stderr, " -v, --variants-only output variant sites only\n"); + fprintf(bcftools_stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); + fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); +//todo? +// fprintf(bcftools_stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n"); +// fprintf(bcftools_stderr, " tag removal [^I16,^QS,^FMT/QS]\n"); + fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); + fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); + fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); + fprintf(bcftools_stderr, " -g, --gvcf INT,[...] 
Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); + fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); + fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); + fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Consensus/variant calling options:\n"); - fprintf(bcftools_stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); - fprintf(bcftools_stderr, " -C, --constrain one of: alleles, trio (see manual)\n"); - fprintf(bcftools_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(bcftools_stderr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(bcftools_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); + fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); + fprintf(bcftools_stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n"); + fprintf(bcftools_stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); + fprintf(bcftools_stderr, " -n, --novel-rate FLOAT,[...] 
Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); + fprintf(bcftools_stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)aux.min_perm_p); fprintf(bcftools_stderr, "\n"); - exit(-1); + bcftools_exit(-1); } int main_vcfcall(int argc, char *argv[]) @@ -929,9 +954,11 @@ int main_vcfcall(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"format-fields",required_argument,NULL,'f'}, + {"annotate",required_argument,NULL,'a'}, {"prior-freqs",required_argument,NULL,'F'}, {"gvcf",required_argument,NULL,'g'}, {"group-samples",required_argument,NULL,'G'}, + {"group-samples-tag",required_argument,NULL,3}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"regions",required_argument,NULL,'r'}, @@ -962,7 +989,7 @@ int main_vcfcall(int argc, char *argv[]) }; char *tmp = NULL; - while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0) { switch (c) { @@ -971,7 +998,12 @@ int main_vcfcall(int argc, char *argv[]) case 'X': ploidy = "X"; fprintf(bcftools_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; case 'Y': ploidy = "Y"; fprintf(bcftools_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; case 'G': args.aux.sample_groups = optarg; break; - case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; + case 3 : args.aux.sample_groups_tag = optarg; break; + case 'f': fprintf(bcftools_stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n"); + case 'a': + if (optarg[0]=='?') { list_annotations(bcftools_stderr); return 1; } + args.aux.output_tags |= parse_output_tags(optarg); + break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where 
first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index 2d8a94c..02d610d 100644 --- a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index 21b9e9d..d74486d 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -1236,7 +1237,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -P, --same-prob prior probability of -s/-c being the same [0.5]\n"); fprintf(bcftools_stderr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfcnv(int argc, char *argv[]) diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index dce17f9..0781a60 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -26,6 +26,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -115,7 +116,7 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->allow_overlaps || args->phased_concat ) { @@ -154,6 +155,7 @@ static void init_data(args_t *args) else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE; + else if ( !strcmp(args->remove_dups,"exact") ) args->files->collapse = COLLAPSE_NONE; else error("The -D string \"%s\" not recognised.\n", args->remove_dups); } for (i=0; infnames; i++) @@ -233,6 +235,7 @@ static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec) if ( !args->swap_phase[i] ) continue; int *gt = &args->GTa[i*2]; if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue; + if ( !bcf_gt_is_phased(gt[1]) ) continue; SWAP(int, gt[0], gt[1]); gt[1] |= 1; } @@ -845,8 +848,8 @@ static void usage(args_t *args) fprintf(stderr, "Options:\n"); fprintf(stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); fprintf(stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); - fprintf(stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); - fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n"); + fprintf(stderr, " -d, --rm-dups Output duplicate records present in multiple files 
only once: \n"); + fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n"); fprintf(stderr, " -f, --file-list Read the list of files from a file.\n"); fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); @@ -903,7 +906,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; - case 'D': args->remove_dups = "none"; break; + case 'D': args->remove_dups = "exact"; break; case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index 0004a55..0cd061e 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -28,6 +28,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -117,7 +118,7 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->allow_overlaps || args->phased_concat ) { @@ -156,6 +157,7 @@ static void init_data(args_t *args) else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE; + else if ( !strcmp(args->remove_dups,"exact") ) args->files->collapse = COLLAPSE_NONE; else error("The -D string \"%s\" not recognised.\n", args->remove_dups); } for (i=0; infnames; i++) @@ -235,6 +237,7 @@ static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec) if ( !args->swap_phase[i] ) continue; int *gt = &args->GTa[i*2]; if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue; + if ( !bcf_gt_is_phased(gt[1]) ) continue; SWAP(int, gt[0], gt[1]); gt[1] |= 1; } @@ -847,8 +850,8 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Options:\n"); fprintf(bcftools_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); fprintf(bcftools_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); - fprintf(bcftools_stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); - fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d none\n"); + fprintf(bcftools_stderr, " -d, 
--rm-dups Output duplicate records present in multiple files only once: \n"); + fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d exact\n"); fprintf(bcftools_stderr, " -f, --file-list Read the list of files from a file.\n"); fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); @@ -862,7 +865,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --threads Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfconcat(int argc, char *argv[]) @@ -905,7 +908,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; - case 'D': args->remove_dups = "none"; break; + case 'D': args->remove_dups = "exact"; break; case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index 445a894..a48e85c 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -68,7 +68,7 @@ struct _args_t int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; - int argc, n_threads, record_cmd_line; + int argc, n_threads, record_cmd_line, keep_duplicates; }; static void destroy_data(args_t *args) @@ -153,6 +153,15 @@ static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss); rec->pos--; + // ID + if ( args->output_vcf_ids ) + { + char tmp = *tsv->se; + *tsv->se = 0; + bcf_update_id(args->header, rec, tsv->ss); + *tsv->se = tmp; + } + // REF,ALT args->str.l = 0; se = ++ss; @@ -385,7 +394,7 @@ static void gensample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -513,7 +522,7 @@ static void haplegendsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -627,7 +636,7 @@ static void hapsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( 
out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -802,7 +811,7 @@ static void vcf_to_gensample(args_t *args) } // skip duplicate lines, or otherwise shapeit complains - if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; } + if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; } prev_rid = line->rid; prev_pos = line->pos; @@ -977,7 +986,7 @@ static void vcf_to_hapsample(args_t *args) if ( args->output_vcf_ids ) kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str); else - kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); @@ -994,7 +1003,7 @@ static void vcf_to_hapsample(args_t *args) if ( n_files==1 ) { int l = str.l; - kputs(".sample",&str); + kputs(".samples",&str); sample_fname = strdup(str.s); str.l = l; kputs(".hap.gz",&str); @@ -1215,7 +1224,7 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_add_sample(args->header, NULL); args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1267,7 +1276,7 @@ static void tsv_to_vcf(args_t *args) static void vcf_to_vcf(args_t *args) { open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + 
htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1296,7 +1305,7 @@ static void gvcf_to_vcf(args_t *args) if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname); open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1395,6 +1404,7 @@ static void usage(void) fprintf(stderr, " -g, --gensample <...> |,\n"); fprintf(stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(stderr, " --keep-duplicates keep duplicate positions\n"); fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); @@ -1473,12 +1483,17 @@ int main_vcfconvert(int argc, char *argv[]) {"columns",required_argument,NULL,'c'}, {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, + {"keep-duplicates",no_argument,NULL,12}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { switch (c) { - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; 
args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 't': args->targets_list = optarg; break; @@ -1512,6 +1527,7 @@ int main_vcfconvert(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 10 : args->record_cmd_line = 0; break; case 11 : args->sex_fname = optarg; break; + case 12 : args->keep_duplicates = 1; break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index abdfbec..358e404 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -70,7 +70,7 @@ struct _args_t int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; - int argc, n_threads, record_cmd_line; + int argc, n_threads, record_cmd_line, keep_duplicates; }; static void destroy_data(args_t *args) @@ -155,6 +155,15 @@ static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss); rec->pos--; + // ID + if ( args->output_vcf_ids ) + { + char tmp = *tsv->se; + *tsv->se = 0; + bcf_update_id(args->header, rec, tsv->ss); + *tsv->se = tmp; + } + // REF,ALT args->str.l = 0; se = ++ss; @@ -387,7 +396,7 @@ static void gensample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -515,7 +524,7 @@ static void haplegendsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -629,7 +638,7 @@ static void hapsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( 
out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -804,7 +813,7 @@ static void vcf_to_gensample(args_t *args) } // skip duplicate lines, or otherwise shapeit complains - if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; } + if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; } prev_rid = line->rid; prev_pos = line->pos; @@ -979,7 +988,7 @@ static void vcf_to_hapsample(args_t *args) if ( args->output_vcf_ids ) kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str); else - kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); @@ -996,7 +1005,7 @@ static void vcf_to_hapsample(args_t *args) if ( n_files==1 ) { int l = str.l; - kputs(".sample",&str); + kputs(".samples",&str); sample_fname = strdup(str.s); str.l = l; kputs(".hap.gz",&str); @@ -1217,7 +1226,7 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_add_sample(args->header, NULL); args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1269,7 +1278,7 @@ static void tsv_to_vcf(args_t *args) static void vcf_to_vcf(args_t *args) { open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + 
htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1298,7 +1307,7 @@ static void gvcf_to_vcf(args_t *args) if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname); open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1397,6 +1406,7 @@ static void usage(void) fprintf(bcftools_stderr, " -g, --gensample <...> |,\n"); fprintf(bcftools_stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(bcftools_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, " --keep-duplicates keep duplicate positions\n"); fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); fprintf(bcftools_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); fprintf(bcftools_stderr, "\n"); @@ -1433,7 +1443,7 @@ static void usage(void) // fprintf(bcftools_stderr, "PBWT options:\n"); // fprintf(bcftools_stderr, " -b, --pbwt or ,,,\n"); // fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfconvert(int argc, char *argv[]) @@ -1475,12 +1485,17 @@ int main_vcfconvert(int argc, char *argv[]) {"columns",required_argument,NULL,'c'}, {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, + {"keep-duplicates",no_argument,NULL,12}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { switch (c) { - case 'e': 
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 't': args->targets_list = optarg; break; @@ -1514,6 +1529,7 @@ int main_vcfconvert(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 10 : args->record_cmd_line = 0; break; case 11 : args->sex_fname = optarg; break; + case 12 : args->keep_duplicates = 1; break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index 257ee3f..723bcdf 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -25,8 +25,10 @@ THE SOFTWARE. */ #include #include #include +#include #include #include +#include #include #include #include @@ -60,7 +62,8 @@ typedef struct _args_t char *soft_filter; // drop failed sites or annotate FILTER column? int annot_mode; // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is? 
int flt_fail, flt_pass; // BCF ids of fail and pass filters - int snp_gap, indel_gap, IndelGap_id, SnpGap_id; + int snp_gap, snp_gap_type, indel_gap, IndelGap_id, SnpGap_id; + char *snp_gap_str; int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac; rbuf_t rbuf; bcf1_t **rbuf_lines; @@ -77,7 +80,7 @@ args_t; static void init_data(args_t *args) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -138,7 +141,7 @@ static void init_data(args_t *args) args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); if ( args->snp_gap ) { - bcf_hdr_printf(args->hdr, "##FILTER=", args->snp_gap); + bcf_hdr_printf(args->hdr, "##FILTER=", args->snp_gap,args->snp_gap_str); args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap"); assert( args->SnpGap_id>=0 ); } @@ -217,9 +220,9 @@ static void buffered_filters(args_t *args, bcf1_t *line) */ // To avoid additional data structure, we abuse bcf1_t's var and var_type records. - const int SnpGap_set = VCF_OTHER<<1; - const int IndelGap_set = VCF_OTHER<<2; - const int IndelGap_flush = VCF_OTHER<<3; + const int SnpGap_set = 1 << (8*sizeof(int)/2); + const int IndelGap_set = 1 << (8*sizeof(int)/2-1); + const int IndelGap_flush = 1 << (8*sizeof(int)/2-2); int var_type = 0, i; if ( line ) @@ -245,15 +248,8 @@ static void buffered_filters(args_t *args, bcf1_t *line) // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be // used. This filter is therefore more strict and may remove some valid // SNPs. 
- int len = 1; - if ( var_type & VCF_INDEL ) - { - for (i=1; in_allele; i++) - if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n; - } - // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion. - line->d.var[0].n = len; + line->d.var[0].n = line->rlen; } int k_flush = 1; @@ -328,13 +324,13 @@ static void buffered_filters(args_t *args, bcf1_t *line) int rec_to = rec->pos + rec->d.var[0].n - 1; // last position affected by the variant if ( rec_to + args->snp_gap < last_from ) j_flush++; - else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) ) + else if ( (var_type & args->snp_gap_type) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) ) { // this SNP has not been SnpGap-filtered yet rec->d.var_type |= SnpGap_set; bcf_add_filter(args->hdr, rec, args->SnpGap_id); } - else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) ) + else if ( (var_type & VCF_SNP) && (rec->d.var_type & args->snp_gap_type) ) { // the line which we are adding is a SNP and needs to be filtered line->d.var_type |= SnpGap_set; @@ -413,7 +409,7 @@ static void usage(args_t *args) fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -g, --SnpGap filter SNPs within base pairs of an indel\n"); + fprintf(stderr, " -g, --SnpGap [:type] filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); fprintf(stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); fprintf(stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); @@ -465,9 +461,31 @@ int main_vcffilter(int argc, char *argv[]) 
char *tmp; while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) { switch (c) { - case 'g': + case 'g': args->snp_gap = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg); + if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); + if ( *tmp==':' ) + { + args->snp_gap_str = tmp+1; + int i,n; + char **keys = hts_readlist(tmp+1,0,&n); + for(i=0; isnp_gap_type |= VCF_INDEL; + else if ( !strcasecmp(keys[i],"mnp") ) args->snp_gap_type |= VCF_MNP; + else if ( !strcasecmp(keys[i],"bnd") ) args->snp_gap_type |= VCF_BND; + else if ( !strcasecmp(keys[i],"other") ) args->snp_gap_type |= VCF_OTHER; + else if ( !strcasecmp(keys[i],"overlap") ) args->snp_gap_type |= VCF_OVERLAP; + else error("Could not parse \"%s\" in \"--SnpGap %s\"\n", keys[i], optarg); + free(keys[i]); + } + if ( n ) free(keys); + } + else + { + args->snp_gap_type = VCF_INDEL; + args->snp_gap_str = "indel"; + } break; case 'G': args->indel_gap = strtol(optarg,&tmp,10); @@ -492,8 +510,12 @@ int main_vcffilter(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'S': if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING; else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF; diff --git 
a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index 908c3b4..5709182 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -2,7 +2,7 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -27,8 +27,10 @@ THE SOFTWARE. */ #include #include #include +#include #include #include +#include #include #include #include @@ -62,7 +64,8 @@ typedef struct _args_t char *soft_filter; // drop failed sites or annotate FILTER column? int annot_mode; // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is? int flt_fail, flt_pass; // BCF ids of fail and pass filters - int snp_gap, indel_gap, IndelGap_id, SnpGap_id; + int snp_gap, snp_gap_type, indel_gap, IndelGap_id, SnpGap_id; + char *snp_gap_str; int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac; rbuf_t rbuf; bcf1_t **rbuf_lines; @@ -79,7 +82,7 @@ args_t; static void init_data(args_t *args) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -140,7 +143,7 @@ static void init_data(args_t *args) args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); if ( args->snp_gap ) { - bcf_hdr_printf(args->hdr, "##FILTER=", args->snp_gap); + bcf_hdr_printf(args->hdr, "##FILTER=", args->snp_gap,args->snp_gap_str); args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap"); assert( args->SnpGap_id>=0 ); } @@ -219,9 +222,9 @@ static void buffered_filters(args_t *args, bcf1_t *line) */ // To avoid additional data structure, we abuse bcf1_t's var and var_type records. 
- const int SnpGap_set = VCF_OTHER<<1; - const int IndelGap_set = VCF_OTHER<<2; - const int IndelGap_flush = VCF_OTHER<<3; + const int SnpGap_set = 1 << (8*sizeof(int)/2); + const int IndelGap_set = 1 << (8*sizeof(int)/2-1); + const int IndelGap_flush = 1 << (8*sizeof(int)/2-2); int var_type = 0, i; if ( line ) @@ -247,15 +250,8 @@ static void buffered_filters(args_t *args, bcf1_t *line) // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be // used. This filter is therefore more strict and may remove some valid // SNPs. - int len = 1; - if ( var_type & VCF_INDEL ) - { - for (i=1; in_allele; i++) - if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n; - } - // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion. - line->d.var[0].n = len; + line->d.var[0].n = line->rlen; } int k_flush = 1; @@ -330,13 +326,13 @@ static void buffered_filters(args_t *args, bcf1_t *line) int rec_to = rec->pos + rec->d.var[0].n - 1; // last position affected by the variant if ( rec_to + args->snp_gap < last_from ) j_flush++; - else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) ) + else if ( (var_type & args->snp_gap_type) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) ) { // this SNP has not been SnpGap-filtered yet rec->d.var_type |= SnpGap_set; bcf_add_filter(args->hdr, rec, args->SnpGap_id); } - else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) ) + else if ( (var_type & VCF_SNP) && (rec->d.var_type & args->snp_gap_type) ) { // the line which we are adding is a SNP and needs to be filtered line->d.var_type |= SnpGap_set; @@ -415,7 +411,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -g, --SnpGap filter SNPs within base pairs of an indel\n"); + 
fprintf(bcftools_stderr, " -g, --SnpGap [:type] filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); fprintf(bcftools_stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); fprintf(bcftools_stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); @@ -430,7 +426,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcffilter(int argc, char *argv[]) @@ -467,9 +463,31 @@ int main_vcffilter(int argc, char *argv[]) char *tmp; while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) { switch (c) { - case 'g': + case 'g': args->snp_gap = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg); + if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); + if ( *tmp==':' ) + { + args->snp_gap_str = tmp+1; + int i,n; + char **keys = hts_readlist(tmp+1,0,&n); + for(i=0; isnp_gap_type |= VCF_INDEL; + else if ( !strcasecmp(keys[i],"mnp") ) args->snp_gap_type |= VCF_MNP; + else if ( !strcasecmp(keys[i],"bnd") ) args->snp_gap_type |= VCF_BND; + else if ( !strcasecmp(keys[i],"other") ) args->snp_gap_type |= VCF_OTHER; + else if ( !strcasecmp(keys[i],"overlap") ) args->snp_gap_type |= VCF_OVERLAP; + else error("Could not parse \"%s\" in \"--SnpGap %s\"\n", keys[i], optarg); + free(keys[i]); + } + if ( n ) free(keys); + } + else + { + args->snp_gap_type = VCF_INDEL; + args->snp_gap_str = "indel"; + } break; case 'G': args->indel_gap = 
strtol(optarg,&tmp,10); @@ -494,8 +512,12 @@ int main_vcffilter(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'S': if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING; else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF; diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index 8bf3223..8a96e3e 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -1,6 +1,6 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -26,8 +26,10 @@ THE SOFTWARE. */ #include #include #include +#include #include #include +#include #include #include #include @@ -35,240 +37,46 @@ THE SOFTWARE. 
*/ #include #include #include +#include +#include #include +#include #include "bcftools.h" -#include "hclust.h" +#include "extsort.h" +//#include "hclust.h" typedef struct { - bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check - bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF - int ntmp_arr, npl_arr; - int32_t *tmp_arr, *pl_arr; - double *lks, *sites, min_inter_err, max_intra_err; - int *cnts, *dps, hom_only, cross_check, all_sites; - char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample; - int argc, no_PLs, narr, nsmpl; -} -args_t; - -FILE *open_file(char **fname, const char *mode, const char *fmt, ...); -char *msprintf(const char *fmt, ...); -void mkdir_p(const char *fmt, ...); - -void py_plot(char *script) -{ - mkdir_p(script); - int len = strlen(script); - char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script); - int ret = system(cmd); - if ( ret ) fprintf(stderr, "The command returned non-zero status %d: %s\n", ret, cmd); - free(cmd); -} - -static void plot_check(args_t *args, char *target_sample, char *query_sample) -{ - char *fname; - FILE *fp = open_file(&fname, "w", "%s.py", args->plot); - fprintf(fp, - "import matplotlib as mpl\n" - "mpl.use('Agg')\n" - "import matplotlib.pyplot as plt\n" - "import matplotlib.gridspec as gridspec\n" - "import csv\n" - "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" - "\n" - "sample_ids = False\n" - "\n" - "dat = []\n" - "with open('%s.tab', 'r') as f:\n" - " reader = csv.reader(f, 'tab')\n" - " for row in reader:\n" - " if row[0][0]=='#': continue\n" - " if row[0]!='CN': continue\n" - " tgt = 0\n" - " if row[4]=='%s': tgt = 1\n" - " dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n" - "\n" - "dat = sorted(dat)\n" - "\n" - "iq = -1; dp = 0\n" - "for i in range(len(dat)):\n" - " if iq==-1 and dat[i][3]==1: iq = i\n" - 
" dp += dat[i][2]\n" - "dp /= len(dat)\n" - "\n" - "fig,ax1 = plt.subplots(figsize=(8,5))\n" - "ax2 = ax1.twinx()\n" - "plots = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n" - "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n" - "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n" - "if iq!=-1:\n" - " ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n" - " ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n" - " ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n" - "for tl in ax1.get_yticklabels(): tl.set_color('g')\n" - "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n" - "min_dp = min([x[2] for x in dat])\n" - "max_dp = max([x[2] for x in dat])\n" - "ax2.set_ylim(min_dp-1,max_dp+1)\n" - "ax1.set_title('Discordance with %s')\n" - "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n" - "ax1.set_xlabel('Sample ID')\n" - "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n" - "if sample_ids:\n" - " ax1.set_xticks(range(len(dat)))\n" - " ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n" - " plt.subplots_adjust(bottom=0.2)\n" - "ax1.set_ylabel('Discordance',color='g')\n" - "ax2.set_ylabel('Number of sites',color='k')\n" - "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n" - "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n" - "labels = [l.get_label() for l in plots]\n" - "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n" - "plt.savefig('%s.png')\n" - "plt.close()\n" - "\n", args->plot, target_sample, target_sample, query_sample, args->plot - ); - fclose(fp); - py_plot(fname); - free(fname); -} - -#if 0 -static void plot_cross_check(args_t *args) -{ - char *fname; - FILE *fp = open_file(&fname, "w", 
"%s.py", args->plot); - fprintf(fp, - "import matplotlib as mpl\n" - "mpl.use('Agg')\n" - "import matplotlib.pyplot as plt\n" - "import matplotlib.gridspec as gridspec\n" - "import csv\n" - "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" - "avg = []\n" - "dp = []\n" - "sm2id = {}\n" - "dat = None\n" - "min = None\n" - "max = None\n" - "with open('%s.tab', 'r') as f:\n" - " reader = csv.reader(f, 'tab')\n" - " i = 0\n" - " for row in reader:\n" - " if row[0]=='SM':\n" - " sm2id[row[4]] = i\n" - " avg.append([i,float(row[1])])\n" - " dp.append([i,float(row[2])])\n" - " i += 1\n" - " elif row[0]=='CN':\n" - " val = 0\n" - " if int(row[2])!=0: val = float(row[1])/int(row[2])\n" - " if not dat:\n" - " dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n" - " min = val\n" - " max = val\n" - " id_i = sm2id[row[4]]\n" - " id_j = sm2id[row[5]]\n" - " dat[id_i][id_j] = val\n" - " dat[id_j][id_i] = val\n" - " if min > val: min = val\n" - " if max < val: max = val\n" - "\n" - "if len(sm2id)<=1: exit(1)\n" - "if min==max: exit(1)\n" - "\n" - "fig = plt.figure(figsize=(6,7))\n" - "gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n" - "ax1 = plt.subplot(gs[0])\n" - "ax2 = plt.subplot(gs[1])\n" - "\n" - "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n" - "ax3 = ax1.twinx()\n" - "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n" - "for tl in ax3.get_yticklabels():\n" - " tl.set_color('r')\n" - " tl.set_fontsize(9)\n" - "\n" - "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n" - "cb1 = plt.colorbar(im,ax=ax2)\n" - "cb1.set_label('Pairwise discordance')\n" - "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n" - "\n" - "ax1.tick_params(axis='both', which='major', labelsize=9)\n" - "ax1.tick_params(axis='both', which='minor', labelsize=9)\n" - "ax2.tick_params(axis='both', which='major', labelsize=9)\n" - "ax2.tick_params(axis='both', which='minor', labelsize=9)\n" - "\n" 
- "ax1.set_title('Sample Discordance Score')\n" - "ax2.set_ylabel('Sample ID')\n" - "ax2.set_xlabel('Sample ID')\n" - "ax3.set_ylabel('Average Depth',color='r')\n" - "ax1.set_xlabel('Sample ID')\n" - "ax1.set_ylabel('Average discordance')\n" - "\n" - "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n" - "plt.savefig('%s.png')\n" - "plt.close()\n" - "\n", args->plot,args->plot - ); - fclose(fp); - py_plot(fname); - free(fname); -} -#endif - -static void init_data(args_t *args) -{ - args->sm_hdr = args->files->readers[0].header; - if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname); - - if ( !args->cross_check ) - { - args->gt_hdr = args->files->readers[1].header; - int nsamples = bcf_hdr_nsamples(args->gt_hdr); - if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname); - args->lks = (double*) calloc(nsamples,sizeof(double)); - args->cnts = (int*) calloc(nsamples,sizeof(int)); - args->sites = (double*) calloc(nsamples,sizeof(double)); - args->dps = (int*) calloc(nsamples,sizeof(int)); - } + int iqry, igt; } +pair_t; -static void destroy_data(args_t *args) -{ - free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites); -} - -static int allele_to_int(bcf1_t *line, char *allele) +typedef struct { - int i; - for (i=0; in_allele; i++) - if ( !strcmp(allele,line->d.allele[i]) ) return i; - if ( strcmp(line->d.allele[i-1],"X") ) return -1; - return i-1; -} + bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check + bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF + char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples; + int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; + int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; + double *pdiff, 
*qry_prob, *gt_prob; + uint32_t *ndiff,*ncnt,ncmp, npairs; + int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr; + uint8_t *qry_dsg, *gt_dsg; + pair_t *pairs; + double *hwe_prob, dsg2prob[8][3], pl2prob[256]; + double min_inter_err, max_intra_err; + int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs; + FILE *fp; + unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL; -static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl) -{ - int i, j; - for (i=0; in_allele; i++) - { - // find which of the sm_alleles (k) corresponds to the gt_allele (i) - int k = allele_to_int(sm_line, gt_line->d.allele[i]); - if ( k<0 ) return 0; - for (j=0; j<=i; j++) - { - int l = allele_to_int(sm_line, gt_line->d.allele[j]); - if ( l<0 ) return 0; - gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k); - } - } - //for (i=0; icwd); } -static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line) +static int cmp_int(const void *_a, const void *_b) { - // PLs not present, use GTs instead. - int fake_PL = args->no_PLs ? 
args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs - int nsm_gt, i; - if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) - error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); - nsm_gt /= bcf_hdr_nsamples(hdr); - int npl = line->n_allele*(line->n_allele+1)/2; - hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr); - for (i=0; itmp_arr + i*nsm_gt; - int j, *pl_ptr = args->pl_arr + i*npl; - if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype - { - for (j=0; j b ) return 1; + return 0; +} +static int cmp_pair(const void *_a, const void *_b) +{ + pair_t *a = (pair_t*)_a; + pair_t *b = (pair_t*)_b; + if ( a->iqry < b->iqry ) return -1; + if ( a->iqry > b->iqry ) return 1; + if ( a->igt < b->igt ) return -1; + if ( a->igt > b->igt ) return 1; + return 0; } -static int cmp_doubleptr(const void *_a, const void *_b) +typedef struct +{ + uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms + unsigned long kbs_dat[1]; +} +diff_sites_t; +#if DBG +static void diff_sites_debug_print(args_t *args, diff_sites_t *ds) +{ + int i; + memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); + fprintf(stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff); + for (i=0; inpairs; i++) fprintf(stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0); + fprintf(stderr,"\n"); +} +#endif +static int diff_sites_cmp(const void *aptr, const void *bptr) +{ + diff_sites_t *a = *((diff_sites_t**)aptr); + diff_sites_t *b = *((diff_sites_t**)bptr); + if ( a->ndiff < b->ndiff ) return 1; // descending order + if ( a->ndiff > b->ndiff ) return -1; + if ( a->rand < b->rand ) return -1; + if ( a->rand > b->rand ) return 1; + return 0; +} +static void diff_sites_init(args_t *args) +{ + int nsites = args->distinctive_sites<=1 ? 
args->npairs*args->distinctive_sites : args->distinctive_sites; + if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites); + if ( nsites > args->npairs ) + { + fprintf(stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n"); + nsites = args->npairs; + args->distinctive_sites = args->npairs + 1; + } + else + args->distinctive_sites = nsites; + args->kbs_diff = kbs_init(args->npairs); + size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS; + assert( n==args->kbs_diff->n ); + args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long); + args->es = extsort_alloc(); + extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size); + extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix); + extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem); + extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp); + extsort_init(args->es); +} +static void diff_sites_destroy(args_t *args) { - double *a = *((double**)_a); - double *b = *((double**)_b); - if ( *a < *b ) return -1; - else if ( *a == *b ) return 0; + kbs_destroy(args->kbs_diff); + extsort_destroy(args->es); +} +static inline void diff_sites_reset(args_t *args) +{ + kbs_clear(args->kbs_diff); +} +static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos) +{ + diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size); + memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added) + dat->ndiff = ndiff; + dat->rid = rid; + dat->pos = pos; + dat->rand = hts_lrand48(); + memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long)); + extsort_push(args->es,dat); +} +static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos) +{ + diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es); + if ( !dat ) return 0; + *ndiff = 
dat->ndiff; + *rid = dat->rid; + *pos = dat->pos; + memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); return 1; } -static void check_gt(args_t *args) +static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname) { - int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0; - int fake_pls = args->no_PLs; + int i; + if ( !strcmp(list,"-") ) + { + *nsmpl = bcf_hdr_nsamples(hdr); + *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl)); + for (i=0; i<*nsmpl; i++) (*smpl)[i] = i; + return; + } - // Initialize things: check which tags are defined in the header, sample names etc. - if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname); - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) + char **tmp = hts_readlist(list, list_is_file, nsmpl); + if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list); + *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl)); + for (i=0; i<*nsmpl; i++) { - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) - error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); - if ( !args->no_PLs ) - fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); - fake_pls = 1; + int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]); + if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]); + (*smpl)[i] = idx; + free(tmp[i]); } + free(tmp); + qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int); + // check for duplicates + for (i=1; i<*nsmpl; i++) + if ( (*smpl)[i-1]==(*smpl)[i] ) + error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list); +} - FILE *fp = args->plot ? 
open_file(NULL, "w", "%s.tab", args->plot) : stdout; - print_header(args, fp); +static void init_data(args_t *args) +{ + hts_srand48(0); - int tgt_isample = -1, query_isample = 0; - if ( args->target_sample ) + args->files = bcf_sr_init(); + if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); + if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); + + if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX); + if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum)); + if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) + error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); + + args->qry_hdr = bcf_sr_get_header(args->files,0); + if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname); + if ( args->gt_fname ) { - tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample); - if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample); + args->gt_hdr = bcf_sr_get_header(args->files,1); + if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname); } - if ( args->all_sites ) + + // Determine whether GT or PL will be used + if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default { - if ( tgt_isample==-1 ) - { - fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); - tgt_isample = 0; - } + if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 ) + args->qry_use_GT = 0; + else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 ) + args->qry_use_GT = 1; + 
else + error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname); } - if ( args->query_sample ) + else if ( args->qry_use_GT==1 ) { - query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample); - if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample); + if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 ) + error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname); } - if ( args->all_sites ) - fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n", - args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]); + else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 ) + error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname); - // Main loop - float prev_lk = 0; - while ( (ret=bcf_sr_next_line(args->files)) ) + if ( args->gt_hdr ) { - if ( ret!=2 ) continue; - bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file - bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file - bcf_unpack(sm_line, BCF_UN_FMT); - bcf_unpack(gt_line, BCF_UN_FMT); - - // Init mapping from target genotype index to the sample's PL fields - int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2; - if ( n_gt2ipl > m_gt2ipl ) + if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default + { + if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 ) + args->gt_use_GT = 1; + else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 ) + args->gt_use_GT = 0; + else + error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname); + } + else if ( args->gt_use_GT==1 ) { - m_gt2ipl = n_gt2ipl; - gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl); + if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 ) + error("[E::%s] 
The GT tag is not present in the header of %s\n", __func__, args->gt_fname); } - if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue; - - // Target genotypes - int ngt, npl; - if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) - error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - ngt /= bcf_hdr_nsamples(args->gt_hdr); - if ( ngt!=2 ) continue; // checking only diploid genotypes + else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 ) + error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname); + } + else + args->gt_use_GT = args->qry_use_GT; - // Sample PLs - if ( !fake_pls ) + // Prepare samples + int i,j; + args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr); + if ( args->qry_samples ) + { + init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); + } + if ( args->gt_samples ) + { + init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, + args->gt_hdr ? args->gt_hdr : args->qry_hdr, + args->gt_fname ? args->gt_fname : args->qry_fname); + } + else if ( args->pair_samples ) + { + int npairs; + char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs); + if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples); + if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n"); + args->npairs = args->pair_samples_is_file ? npairs : npairs/2; + args->pairs = (pair_t*) calloc(args->npairs,sizeof(*args->pairs)); + if ( !args->pair_samples_is_file ) { - if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 ) + for (i=0; inpairs; i++) { - if ( sm_line->n_allele==1 ) - { - // PL values may not be present when ALT=. 
(mpileup/bcftools output), in that case - // switch automatically to GT at these sites - npl = fake_PLs(args, args->sm_hdr, sm_line); - } - else - error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1); + args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]); + args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]); + if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]); + if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]); + free(tmp[2*i]); + free(tmp[2*i+1]); } - else - npl /= bcf_hdr_nsamples(args->sm_hdr); } else - npl = fake_PLs(args, args->sm_hdr, sm_line); + { + for (i=0; inpairs; i++) + { + char *ptr = tmp[i]; + while ( *ptr && !isspace(*ptr) ) ptr++; + if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]); + *ptr = 0; + args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]); + if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]); + ptr++; + while ( *ptr && isspace(*ptr) ) ptr++; + args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr); + if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr); + free(tmp[i]); + } + } + free(tmp); + qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair); + } + else if ( args->gt_hdr ) + args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr); + if ( !args->ngt_smpl ) + { + args->ngt_smpl = args->nqry_smpl; + args->gt_smpl = args->qry_smpl; + args->cross_check = 1; + } + + // The data arrays + if ( !args->npairs ) args->npairs = args->cross_check ? 
args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl; + if ( !args->pair_samples ) + { + args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl); + args->gt_dsg = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl); + } + if ( args->use_PLs ) + { + args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff)); // log probability of pair samples being the same + args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob)); + args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob)); + + // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing + // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding + // probabilities of 0/0, 0/1, and 1/1 genotypes + for (i=0; i<8; i++) + for (j=0; j<3; j++) + args->dsg2prob[i][j] = HUGE_VAL; + args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs)); + args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs)); + args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs)); - // Calculate likelihoods for all samples, assuming diploid genotypes + // lookup table to avoid exponentiation + for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i); + } + else + args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff)); // number of differing genotypes for each pair of samples + args->ncnt = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt)); // number of comparisons performed (non-missing data) + if ( !args->ncnt ) error("Error: failed to allocate 
%.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6); + if ( args->calc_hwe_prob ) + { + // prob of the observed sequence of matches given site AFs and HWE + args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob)); + if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6); + } + + if ( args->distinctive_sites ) diff_sites_init(args); + + args->fp = stdout; + print_header(args, args->fp); +} + +static void destroy_data(args_t *args) +{ + if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg); + free(args->qry_dsg); + if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob); + free(args->qry_prob); + free(args->es_max_mem); + fclose(args->fp); + if ( args->distinctive_sites ) diff_sites_destroy(args); + free(args->hwe_prob); + free(args->cwd); + free(args->qry_arr); + if ( args->gt_hdr ) free(args->gt_arr); + free(args->pdiff); + free(args->ndiff); + free(args->ncnt); + free(args->qry_smpl); + if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl); + free(args->pairs); + bcf_sr_destroy(args->files); +} - // For faster access to genotype likelihoods (PLs) of the query sample - int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl; - double sum_pl = 0; // for converting PLs to probs - for (max_ipl=0; max_ipldsg2prob[dsg][0]; + prob[1] = args->dsg2prob[dsg][1]; + prob[2] = args->dsg2prob[dsg][2]; + } + return dsg; +} +static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob) +{ + uint8_t dsg = pl_to_dsg(ptr); + if ( dsg ) + { + prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255]; + prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255]; + prob[2] = (ptr[2]>=0 && ptr[2]<255) ? 
args->pl2prob[ptr[2]] : args->pl2prob[255]; + double sum = prob[0] + prob[1] + prob[2]; + prob[0] /= sum; + prob[1] /= sum; + prob[2] /= sum; + prob[0] = -log(prob[0]); + prob[1] = -log(prob[1]); + prob[2] = -log(prob[2]); + } + return dsg; +} +static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT) +{ + static int warn_dip_GT = 1; + static int warn_dip_PL = 1; + int i; + for (i=0; i<2; i++) + { + if ( *use_GT ) { - if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break; - if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue; - sum_pl += pow(10, -0.1*pl_ptr[max_ipl]); + int ret = bcf_get_genotypes(hdr,rec,arr,narr); + if ( ret < 0 ) + { + if ( !i ) { *use_GT = 0; continue; } + args->nskip_no_data++; + return -1; + } + if ( ret != 2*bcf_hdr_nsamples(hdr) ) + { + if ( warn_dip_GT ) + { + fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1); + warn_dip_GT = 0; + } + args->nskip_dip_GT++; + return -1; + } + *narr1 = 2; + return 0; } - if ( sum_pl==0 ) continue; // no PLs present - if ( fake_pls && args->no_PLs==1 ) sum_pl = -1; - // The main stats: concordance of the query sample with the target -g samples - for (i=0; igt_hdr); i++) + int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr); + if ( ret < 0 ) { - int *gt_ptr = gt_arr + i*ngt; - if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes - if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue; - int a = bcf_gt_allele(gt_ptr[0]); - int b = bcf_gt_allele(gt_ptr[1]); - if ( args->hom_only && a!=b ) continue; // heterozygous genotype - int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file - int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file - if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing - args->lks[i] 
+= sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); - args->sites[i]++; + if ( !i ) { *use_GT = 1; continue; } + args->nskip_no_data++; + return -1; } - if ( args->all_sites ) + if ( ret != 3*bcf_hdr_nsamples(hdr) ) { - // Print LKs at all sites for debugging - int *gt_ptr = gt_arr + tgt_isample*ngt; - if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes - int a = bcf_gt_allele(gt_ptr[0]); - int b = bcf_gt_allele(gt_ptr[1]); - if ( args->hom_only && a!=b ) continue; // heterozygous genotype - fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); - fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : "."); - fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk); - prev_lk = args->lks[query_isample]; - - int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample - for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); - for (igt=0; igtpos+1); + warn_dip_PL = 0; + } + args->nskip_dip_PL++; + return -1; } + *narr1 = 3; + return 0; } - free(gt2ipl); - free(gt_arr); - free(args->pl_arr); - free(args->tmp_arr); + return -1; // should never reach +} +static void process_line(args_t *args) +{ + int i,j,k, nqry1, ngt1, ret; + + bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0); // the query file + int qry_use_GT = args->qry_use_GT; + int gt_use_GT = args->gt_use_GT; + + ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT); + if ( ret<0 ) return; - // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same - // plot as discordance per site, the latter must be scaled to the same range - int nsamples = bcf_hdr_nsamples(args->gt_hdr); - double extreme_lk = 0, extreme_lk_per_site = 0; - for (i=0; igt_hdr ) { - if ( 
args->lks[i] < extreme_lk ) extreme_lk = args->lks[i]; - if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i]; + gt_rec = bcf_sr_get_line(args->files,1); + ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, >_use_GT); + if ( ret<0 ) return; + } + else + { + ngt1 = nqry1; + args->gt_arr = args->qry_arr; } - // Sorted output - double **p = (double**) malloc(sizeof(double*)*nsamples); - for (i=0; ilks[i]; - qsort(p, nsamples, sizeof(int*), cmp_doubleptr); + args->ncmp++; - fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]); - for (i=0; icalc_hwe_prob ) { - int idx = p[i] - args->lks; - double per_site = 0; - if ( args->sites[idx] ) + int ac[2]; + if ( args->gt_hdr ) { - if ( args->sites[idx] && extreme_lk_per_site ) + if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n"); + } + else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n"); + + // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty + // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g. + // hwe[1,2,4] .. dsg=0,1,2 + // hwe[3] .. dsg=0 or 1 + // hwe[6] .. dsg=1 or 2 + + double hwe[3]; + const double min_af = 1e-5; // cap the AF in case we get unrealistic values + af = (double)ac[1]/(ac[0]+ac[1]); + hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af); + hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af)); + hwe[2] = af<(1-min_af) ? 
-log((1-af)*(1-af)) : -log(min_af*min_af); + hwe_dsg[0] = 0; + for (i=1; i<8; i++) + { + hwe_dsg[i] = HUGE_VAL; + for (k=0; k<3; k++) { - per_site = args->lks[idx]/args->sites[idx]; - per_site *= extreme_lk / extreme_lk_per_site; + if ( ((1< hwe[k] ) hwe_dsg[i] = hwe[k]; } - else - per_site = 0; } - fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i); } - if ( args->plot ) + // The sample pairs were given explicitly via -p/-P options + if ( args->pairs ) { - if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__); - plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); - } -} + if ( !args->use_PLs ) + { + int ndiff = 0; + if ( args->kbs_diff ) diff_sites_reset(args); -// static inline int is_hom_most_likely(int nals, int *pls) -// { -// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0]; -// for (ia=1; iasm_hdr, line, &args->tmp_arr, &args->ntmp_arr); + for (i=0; inpairs; i++) + { + int32_t *ptr; + uint8_t qry_dsg, gt_dsg; - if ( ngt<=0 ) return 1; // GT not present - if ( ngt!=args->nsmpl*2 ) return 2; // not diploid - ngt /= args->nsmpl; - - int i,j, idx = 0; - for (i=1; insmpl; i++) - { - int32_t *a = args->tmp_arr + i*ngt; - if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; } - int agt = 1<gt_arr + args->pairs[i].igt*ngt1; + gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr); + if ( !gt_dsg ) continue; // missing value + if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom + + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; + qry_dsg = qry_use_GT ? 
gt_to_dsg(ptr) : pl_to_dsg(ptr); + if ( !qry_dsg ) continue; // missing value + + int match = qry_dsg & gt_dsg; + if ( !match ) + { + args->ndiff[i]++; + if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); } + } + else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match]; + args->ncnt[i]++; + } - for (j=0; jrid, qry_rec->pos); + } + else // use_PLs set { - int32_t *b = args->tmp_arr + j*ngt; - if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; } - int bgt = 1<npairs; i++) + { + int32_t *ptr; + double qry_prob[3], gt_prob[3]; + uint8_t qry_dsg, gt_dsg; + + ptr = args->gt_arr + args->pairs[i].igt*ngt1; + gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob); + if ( !gt_dsg ) continue; // missing value + if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom + + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; + qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob); + if ( !qry_dsg ) continue; // missing value - ntot[idx]++; - if ( agt!=bgt ) ndif[idx]++; - idx++; + double min = qry_prob[0] + gt_prob[0]; + qry_prob[1] += gt_prob[1]; + if ( min > qry_prob[1] ) min = qry_prob[1]; + qry_prob[2] += gt_prob[2]; + if ( min > qry_prob[2] ) min = qry_prob[2]; + args->pdiff[i] += min; + + if ( args->calc_hwe_prob ) + { + int match = qry_dsg & gt_dsg; + args->hwe_prob[i] += hwe_dsg[match]; + } + args->ncnt[i]++; + } } + return; } - return 0; -} -int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif) -{ - int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr); - if ( npl<=0 ) return 1; // PL not present - npl /= args->nsmpl; - - int i,j,k, idx = 0; - for (i=1; insmpl; i++) + int idx=0; + if ( !args->use_PLs ) { - int32_t *a = args->tmp_arr + i*npl; - int imin = -1; - for (k=0; knqry_smpl; i++) { - if ( a[k]==bcf_int32_vector_end ) break; - if ( a[k]==bcf_int32_missing ) continue; - if ( 
imin==-1 || a[imin] > a[k] ) imin = k; + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + int32_t *ptr = args->qry_arr + nqry1*iqry; + args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr); } - if ( imin<0 ) { idx+=i; continue; } - - for (j=0; jcross_check ) // in this case gt_dsg points to qry_dsg { - int32_t *b = args->tmp_arr + j*npl; - int jmin = -1; - for (k=0; kngt_smpl; i++) { - if ( b[k]==bcf_int32_vector_end ) break; - if ( b[k]==bcf_int32_missing ) continue; - if ( jmin==-1 || b[jmin] > b[k] ) jmin = k; + int igt = args->gt_smpl ? args->gt_smpl[i] : i; + int32_t *ptr = args->gt_arr + ngt1*igt; + args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr); + if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value + } + } + for (i=0; inqry_smpl; i++) + { + int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode? + if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value + for (j=0; jgt_dsg[j] ) { idx++; continue; } // missing value + int match = args->qry_dsg[i] & args->gt_dsg[j]; + if ( !match ) args->ndiff[idx]++; + else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match]; + args->ncnt[idx]++; + idx++; } - if ( jmin<0 ) { idx++; continue; } - - ntot[idx]++; - if ( imin!=jmin ) ndif[idx]++; - idx++; } } - return 0; -} + else // use_PLs set + { + for (i=0; inqry_smpl; i++) + { + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + int32_t *ptr = args->qry_arr + nqry1*iqry; + args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3); + } + if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg + { + for (i=0; ingt_smpl; i++) + { + int igt = args->gt_smpl ? args->gt_smpl[i] : i; + int32_t *ptr = args->gt_arr + ngt1*igt; + args->gt_dsg[i] = gt_use_GT ? 
gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3); + if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value + } + } + for (i=0; inqry_smpl; i++) + { + int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode? + if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value + for (j=0; jgt_dsg[j] ) { idx++; continue; } // missing value -static void cross_check_gts(args_t *args) -{ - // Initialize things: check which tags are defined in the header, sample names etc. - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) - { - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) - error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); - if ( !args->no_PLs ) { - fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); - args->no_PLs = 99; + double min = args->qry_prob[i*3] + args->gt_prob[j*3]; + if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1]; + if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2]; + args->pdiff[idx] += min; + + if ( args->calc_hwe_prob ) + { + int match = args->qry_dsg[i] & args->gt_dsg[j]; + args->hwe_prob[idx] += hwe_dsg[match]; + } + args->ncnt[idx]++; + idx++; + } } } +} - args->nsmpl = bcf_hdr_nsamples(args->sm_hdr); - args->narr = (args->nsmpl-1)*args->nsmpl/2; - uint32_t *ndif = (uint32_t*) calloc(args->narr,4); - uint32_t *ntot = (uint32_t*) calloc(args->narr,4); +typedef struct +{ + int ism, idx; + double val; +} +idbl_t; +static int cmp_idbl(const void *_a, const void *_b) +{ + idbl_t *a = (idbl_t*)_a; + idbl_t *b = (idbl_t*)_b; + if ( a->val < b->val ) return -1; + if ( a->val > b->val ) return 1; + return 0; +} +static void report_distinctive_sites(args_t *args) +{ + extsort_sort(args->es); 
+ + fprintf(args->fp,"# DS, distinctive sites:\n"); + fprintf(args->fp,"# - chromosome\n"); + fprintf(args->fp,"# - position\n"); + fprintf(args->fp,"# - cumulative number of pairs distinguished by this block\n"); + fprintf(args->fp,"# - block id\n"); + fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n"); - while ( bcf_sr_next_line(args->files) ) + kbitset_t *kbs_blk = kbs_init(args->npairs); + kbitset_iter_t itr; + int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0; + int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs; + while ( diff_sites_shift(args,&ndiff,&rid,&pos) ) { - bcf1_t *line = bcf_sr_get_line(args->files,0); - - // use PLs unless no_PLs is set and GT exists - if ( args->no_PLs ) + int ndiff_new = 0, ndiff_dbg = 0; + kbs_start(&itr); + while ( (i=kbs_next(args->kbs_diff, &itr))>=0 ) { - if ( process_GT(args,line,ntot,ndif)==0 ) continue; + ndiff_dbg++; + if ( kbs_exists(kbs_blk,i) ) continue; // already set + kbs_insert(kbs_blk,i); + ndiff_new++; } - process_PL(args,line,ntot,ndif); + if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff); + if ( !ndiff_new ) continue; // no new pair distinguished by this site + ndiff_tot += ndiff_new; + fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock); + if ( ndiff_tot < ndiff_min ) continue; // fewer than the requested number of pairs can be distinguished at this point + iblock++; + ndiff_tot = 0; + kbs_clear(kbs_blk); } - - FILE *fp = stdout; - print_header(args, fp); + kbs_destroy(kbs_blk); +} +static void report(args_t *args) +{ + fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp); + fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match); + fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba); + fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono); + 
fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data); + fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT); + fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL); + fprintf(args->fp,"# DC, discordance:\n"); + fprintf(args->fp,"# - query sample\n"); + fprintf(args->fp,"# - genotyped sample\n"); + fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n"); + fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n"); + fprintf(args->fp,"# - number of sites compared (bigger is better)\n"); + fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n"); - float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2); + int trim = args->ntop; + if ( !args->pairs ) + { + if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0; + if ( args->ngt_smpl && args->ngt_smpl <= args->ntop ) trim = 0; + } - // Output pairwise distances - fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n"); - int i,j, idx = 0; - for (i=0; insmpl; i++) + if ( args->pairs ) { - for (j=0; jnpairs; i++) { - float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10; - fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); - PDIST(tmp,i,j) = err; - idx++; + int iqry = args->pairs[i].iqry; + int igt = args->pairs[i].igt; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->ndiff[i], + args->calc_hwe_prob ? 
args->hwe_prob[i] : 0, + args->ncnt[i]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->pdiff[i], + args->calc_hwe_prob ? args->hwe_prob[i] : 0, + args->ncnt[i]); + } } } - - // Cluster samples - int nlist; - float clust_max_err = args->max_intra_err; - hclust_t *clust = hclust_init(args->nsmpl,tmp); - cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist); - fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n"); - for (i=0; ism_hdr->samples[list[i].memb[j]]); - fprintf(fp,"\n"); - } - hclust_destroy_list(list,nlist); - // Debugging output: the cluster graph and data used for deciding - char **dbg = hclust_explain(clust,&nlist); - for (i=0; ism_hdr->samples,clust_max_err)); - hclust_destroy(clust); - free(tmp); - - - // Deprecated output for temporary backward compatibility - fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n"); - fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); - idx = 0; - for (i=0; insmpl; i++) + else if ( !trim ) { - for (j=0; jnqry_smpl; i++) { - fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); - idx++; + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + int ngt = args->cross_check ? i : args->ngt_smpl; + for (j=0; jgt_smpl ? args->gt_smpl[j] : j; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->ndiff[idx], + args->calc_hwe_prob ? 
args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->pdiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + idx++; + } } } - - free(ndif); - free(ntot); - free(args->tmp_arr); + else if ( !args->cross_check ) + { + idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl); + int i,j; + for (i=0; inqry_smpl; i++) + { + int idx = i*args->ngt_smpl; + for (j=0; jngt_smpl; j++) + { + if ( args->sort_by_hwe ) + arr[j].val = -args->hwe_prob[idx]; + else if ( args->ndiff ) + arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0; + else + arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0; + arr[j].ism = j; + arr[j].idx = idx; + idx++; + } + qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl); + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + for (j=0; jntop; j++) + { + int idx = arr[j].idx; + int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->ndiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->pdiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + } + } + free(arr); + } + else + { + int narr = args->nqry_smpl-1; + idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr); + int i,j,k,idx; + for (i=0; inqry_smpl; i++) + { + k = 0, idx = i*(i-1)/2; + for (j=0; jsort_by_hwe ) + arr[k].val = -args->hwe_prob[idx]; + else if ( args->ndiff ) + arr[k].val = args->ncnt[idx] ? 
(double)args->ndiff[idx]/args->ncnt[idx] : 0; + else + arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0; + arr[k].ism = j; + arr[k].idx = idx; + idx++; + k++; + } + for (; jsort_by_hwe ) + arr[k].val = -args->hwe_prob[idx]; + else if ( args->ndiff ) + arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0; + else + arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0; + arr[k].ism = j + 1; + arr[k].idx = idx; + k++; + } + qsort(arr, narr, sizeof(*arr), cmp_idbl); + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + for (j=0; jntop; j++) + { + if ( i <= arr[j].ism ) continue; + int idx = arr[j].idx; + int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->qry_hdr->samples[igt], + args->ndiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->qry_hdr->samples[igt], + args->pdiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + } + } + free(arr); + } } -static char *init_prefix(char *prefix) +static int is_input_okay(args_t *args, int nmatch) { - int len = strlen(prefix); - if ( prefix[len-1] == '/' || prefix[len-1] == '\\' ) - return msprintf("%sgtcheck", prefix); - return strdup(prefix); + int i; + const char *msg; + bcf_hdr_t *hdr; + bcf1_t *rec; + if ( args->gt_hdr && nmatch!=2 ) + { + if ( args->nskip_no_match++ ) return 0; + for (i=0; i<2; i++) + { + rec = bcf_sr_get_line(args->files,i); + if ( rec ) break; + } + hdr = bcf_sr_get_header(args->files,i); + fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. 
(This is printed only once.)\n", + bcf_seqname(hdr,rec),rec->pos+1); + return 0; + } + for (i=0; i<2; i++) + { + hdr = bcf_sr_get_header(args->files,i); + rec = bcf_sr_get_line(args->files,i); + if ( rec->n_allele>2 ) + { + if ( args->nskip_not_ba++ ) return 0; + msg = "not a biallelic site, run `bcftools norm -m -` first"; + goto not_okay; + } + if ( bcf_get_variant_types(rec)==VCF_REF ) + { + if ( args->nskip_mono++ ) return 0; + msg = "monoallelic site"; + goto not_okay; + } + if ( !args->gt_hdr ) break; + } + return 1; + +not_okay: + fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", + bcf_seqname(hdr,rec),rec->pos+1,msg); + return 0; } static void usage(void) @@ -712,18 +1026,41 @@ static void usage(void) fprintf(stderr, "Usage: bcftools gtcheck [options] [-g ] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a, --all-sites output comparison for all sites\n"); - fprintf(stderr, " -c, --cluster min inter- and max intra-sample error [0.23,-0.3]\n"); - fprintf(stderr, " -g, --genotypes genotypes to compare against\n"); - fprintf(stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); - fprintf(stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); - fprintf(stderr, " -p, --plot plot\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --query-sample query sample (by default the first sample is checked)\n"); - fprintf(stderr, " -S, --target-sample target sample in the -g file (used only for plotting)\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + //fprintf(stderr, " -a, --all-sites Output comparison for all sites\n"); + //fprintf(stderr, " -c, --cluster MIN,MAX 
Min inter- and max intra-sample error [0.23,-0.3]\n"); + fprintf(stderr, " --distinctive-sites Find sites that can distinguish between at least NUM sample pairs.\n"); + fprintf(stderr, " NUM[,MEM[,TMP]] If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n"); + fprintf(stderr, " The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n"); +#ifdef _WIN32 + fprintf(stderr, " and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n"); +#else + fprintf(stderr, " and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n"); +#endif + fprintf(stderr, " --dry-run Stop after first record to estimate required time\n"); + fprintf(stderr, " -e, --error-probability INT Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n"); + fprintf(stderr, " -g, --genotypes FILE Genotypes to compare against\n"); + fprintf(stderr, " -H, --homs-only Homozygous genotypes only, useful with low coverage data (requires -g)\n"); + fprintf(stderr, " --n-matches INT Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n"); + fprintf(stderr, " Use negative value to sort by HWE probability rather than by discordance [0]\n"); + fprintf(stderr, " --no-HWE-prob Disable calculation of HWE probability\n"); + fprintf(stderr, " -p, --pairs LIST Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] 
w/o)\n"); + fprintf(stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n"); + fprintf(stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " # Check discordance of all samples from B against all sample in A\n"); + fprintf(stderr, " bcftools gtcheck -g A.bcf B.bcf\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " # Limit comparisons to the fiven list of samples\n"); + fprintf(stderr, " bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " # Compare only two pairs a1,b1 and a1,b2\n"); + fprintf(stderr, " bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n"); fprintf(stderr, "\n"); exit(1); } @@ -732,10 +1069,19 @@ int main_vcfgtcheck(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); - args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; set_cwd(args); - char *regions = NULL, *targets = NULL; - int regions_is_file = 0, targets_is_file = 0; + args->qry_use_GT = -1; + args->gt_use_GT = -1; + args->calc_hwe_prob = 1; + args->use_PLs = 40; + + // external sort for --distinctive-sites +#ifdef _WIN32 + args->es_tmp_prefix = NULL; +#else + args->es_tmp_prefix = "/tmp/bcftools-gtcheck"; +#endif + 
args->es_max_mem = strdup("500M"); // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 // - min_inter: pairs with smaller err value will be considered identical @@ -746,6 +1092,8 @@ int main_vcfgtcheck(int argc, char *argv[]) static struct option loptions[] = { + {"error-probability",1,0,'e'}, + {"use",1,0,'u'}, {"cluster",1,0,'c'}, {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, @@ -753,18 +1101,74 @@ int main_vcfgtcheck(int argc, char *argv[]) {"help",0,0,'h'}, {"genotypes",1,0,'g'}, {"plot",1,0,'p'}, - {"target-sample",1,0,'S'}, - {"query-sample",1,0,'s'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, + {"n-matches",1,0,2}, + {"no-HWE-prob",0,0,3}, + {"target-sample",1,0,4}, + {"dry-run",0,0,5}, + {"distinctive-sites",1,0,6}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"pairs",1,0,'p'}, + {"pairs-file",1,0,'P'}, {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) { switch (c) { + case 'e': + args->use_PLs = strtol(optarg,&tmp,10); + if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg); + break; + case 'u': + { + int i,nlist; + char **list = hts_readlist(optarg, 0, &nlist); + if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg); + if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1; + else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0; + else error("Failed to parse --use %s; only GT and PL are supported\n", optarg); + if ( nlist==2 ) + { + if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1; + else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0; + else error("Failed to parse --use %s; only GT and PL are supported\n", optarg); + } + else args->gt_use_GT = args->qry_use_GT; + for (i=0; intop = strtol(optarg,&tmp,10); + if ( !tmp || *tmp ) 
error("Could not parse: --n-matches %s\n", optarg); + if ( args->ntop < 0 ) + { + args->sort_by_hwe = 1; + args->ntop *= -1; + } + break; + case 3 : args->calc_hwe_prob = 0; break; + case 4 : error("The option -S, --target-sample has been deprecated\n"); break; + case 5 : args->dry_run = 1; break; + case 6 : + args->distinctive_sites = strtod(optarg,&tmp); + if ( *tmp ) + { + if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg); + tmp++; + free(args->es_max_mem); + args->es_max_mem = strdup(tmp); + while ( *tmp && *tmp!=',' ) tmp++; + if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; } + } + args->use_PLs = 0; + break; case 'c': + error("The -c option is to be implemented, please open an issue on github\n"); args->min_inter_err = strtod(optarg,&tmp); if ( *tmp ) { @@ -773,50 +1177,77 @@ int main_vcfgtcheck(int argc, char *argv[]) if ( *tmp ) error("Could not parse: -c %s\n", optarg); } break; - case 'G': - args->no_PLs = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg); - break; - case 'a': args->all_sites = 1; break; + case 'G': error("The option -G, --GTs-only has been deprecated\n"); break; + case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break; case 'H': args->hom_only = 1; break; case 'g': args->gt_fname = optarg; break; - case 'p': args->plot = optarg; break; - case 'S': args->target_sample = optarg; break; - case 's': args->query_sample = optarg; break; - case 'r': regions = optarg; break; - case 'R': regions = optarg; regions_is_file = 1; break; - case 't': targets = optarg; break; - case 'T': targets = optarg; targets_is_file = 1; break; +// case 'p': args->plot = optarg; break; + case 's': + if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3; + else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4; + else error("Which one? 
Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); + break; + case 'S': + if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1; + else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1; + else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); + break; + case 'p': args->pair_samples = optarg; break; + case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break; + case 'r': args->regions = optarg; break; + case 'R': args->regions = optarg; args->regions_is_file = 1; break; + case 't': args->targets = optarg; break; + case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } } - char *fname = NULL; if ( optind==argc ) { - if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-"; // reading from stdin else usage(); // no files given } - else fname = argv[optind]; - if ( argc>optind+1 ) usage(); // too many files given - if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode - else args->files->require_index = 1; - if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); - if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); - if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); - if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) - error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); - args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; - if ( args->plot ) args->plot = 
init_prefix(args->plot); + else args->qry_fname = argv[optind]; + if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n"); // too many files given + if ( args->pair_samples ) + { + if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n"); + if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n"); + } + if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n"); + if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n"); + if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n"); + init_data(args); - if ( args->cross_check ) - cross_check_gts(args); - else - check_gt(args); + + int ret; + while ( (ret=bcf_sr_next_line(args->files)) ) + { + if ( !is_input_okay(args,ret) ) continue; + + // time one record to give the user an estimate with very big files + struct timeval t0, t1; + if ( !args->ncmp ) gettimeofday(&t0, NULL); + + process_line(args); + + if ( args->ncmp==1 ) + { + gettimeofday(&t1, NULL); + double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); + fprintf(stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6); + fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6); + if ( args->dry_run ) break; + } + } + if ( !args->dry_run ) + { + report(args); + if ( args->distinctive_sites ) report_distinctive_sites(args); + } + destroy_data(args); - bcf_sr_destroy(args->files); - if (args->plot) free(args->plot); free(args); return 0; } diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index ae8ba74..6ab27ed 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -2,7 +2,7 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2018 Genome Research Ltd. 
+ Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -28,8 +28,10 @@ THE SOFTWARE. */ #include #include #include +#include #include #include +#include #include #include #include @@ -37,240 +39,46 @@ THE SOFTWARE. */ #include #include #include +#include +#include #include +#include #include "bcftools.h" -#include "hclust.h" +#include "extsort.h" +//#include "hclust.h" typedef struct { - bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check - bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF - int ntmp_arr, npl_arr; - int32_t *tmp_arr, *pl_arr; - double *lks, *sites, min_inter_err, max_intra_err; - int *cnts, *dps, hom_only, cross_check, all_sites; - char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample; - int argc, no_PLs, narr, nsmpl; -} -args_t; - -FILE *open_file(char **fname, const char *mode, const char *fmt, ...); -char *msprintf(const char *fmt, ...); -void mkdir_p(const char *fmt, ...); - -void py_plot(char *script) -{ - mkdir_p(script); - int len = strlen(script); - char *cmd = !strcmp(".py",script+len-3) ? 
msprintf("python %s", script) : msprintf("python %s.py", script); - int ret = system(cmd); - if ( ret ) fprintf(bcftools_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); - free(cmd); -} - -static void plot_check(args_t *args, char *target_sample, char *query_sample) -{ - char *fname; - FILE *fp = open_file(&fname, "w", "%s.py", args->plot); - fprintf(fp, - "import matplotlib as mpl\n" - "mpl.use('Agg')\n" - "import matplotlib.pyplot as plt\n" - "import matplotlib.gridspec as gridspec\n" - "import csv\n" - "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" - "\n" - "sample_ids = False\n" - "\n" - "dat = []\n" - "with open('%s.tab', 'r') as f:\n" - " reader = csv.reader(f, 'tab')\n" - " for row in reader:\n" - " if row[0][0]=='#': continue\n" - " if row[0]!='CN': continue\n" - " tgt = 0\n" - " if row[4]=='%s': tgt = 1\n" - " dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n" - "\n" - "dat = sorted(dat)\n" - "\n" - "iq = -1; dp = 0\n" - "for i in range(len(dat)):\n" - " if iq==-1 and dat[i][3]==1: iq = i\n" - " dp += dat[i][2]\n" - "dp /= len(dat)\n" - "\n" - "fig,ax1 = plt.subplots(figsize=(8,5))\n" - "ax2 = ax1.twinx()\n" - "plots = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n" - "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n" - "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n" - "if iq!=-1:\n" - " ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n" - " ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n" - " ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n" - "for tl in ax1.get_yticklabels(): tl.set_color('g')\n" - "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n" - "min_dp = min([x[2] for x in dat])\n" - "max_dp = max([x[2] for x in dat])\n" 
- "ax2.set_ylim(min_dp-1,max_dp+1)\n" - "ax1.set_title('Discordance with %s')\n" - "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n" - "ax1.set_xlabel('Sample ID')\n" - "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n" - "if sample_ids:\n" - " ax1.set_xticks(range(len(dat)))\n" - " ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n" - " plt.subplots_adjust(bottom=0.2)\n" - "ax1.set_ylabel('Discordance',color='g')\n" - "ax2.set_ylabel('Number of sites',color='k')\n" - "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n" - "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n" - "labels = [l.get_label() for l in plots]\n" - "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n" - "plt.savefig('%s.png')\n" - "plt.close()\n" - "\n", args->plot, target_sample, target_sample, query_sample, args->plot - ); - fclose(fp); - py_plot(fname); - free(fname); -} - -#if 0 -static void plot_cross_check(args_t *args) -{ - char *fname; - FILE *fp = open_file(&fname, "w", "%s.py", args->plot); - fprintf(fp, - "import matplotlib as mpl\n" - "mpl.use('Agg')\n" - "import matplotlib.pyplot as plt\n" - "import matplotlib.gridspec as gridspec\n" - "import csv\n" - "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" - "avg = []\n" - "dp = []\n" - "sm2id = {}\n" - "dat = None\n" - "min = None\n" - "max = None\n" - "with open('%s.tab', 'r') as f:\n" - " reader = csv.reader(f, 'tab')\n" - " i = 0\n" - " for row in reader:\n" - " if row[0]=='SM':\n" - " sm2id[row[4]] = i\n" - " avg.append([i,float(row[1])])\n" - " dp.append([i,float(row[2])])\n" - " i += 1\n" - " elif row[0]=='CN':\n" - " val = 0\n" - " if int(row[2])!=0: val = float(row[1])/int(row[2])\n" - " if not dat:\n" - " dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n" - " min = val\n" - " max = val\n" - " id_i = sm2id[row[4]]\n" - " id_j = sm2id[row[5]]\n" - " dat[id_i][id_j] = val\n" - " 
dat[id_j][id_i] = val\n" - " if min > val: min = val\n" - " if max < val: max = val\n" - "\n" - "if len(sm2id)<=1: exit(1)\n" - "if min==max: exit(1)\n" - "\n" - "fig = plt.figure(figsize=(6,7))\n" - "gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n" - "ax1 = plt.subplot(gs[0])\n" - "ax2 = plt.subplot(gs[1])\n" - "\n" - "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n" - "ax3 = ax1.twinx()\n" - "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n" - "for tl in ax3.get_yticklabels():\n" - " tl.set_color('r')\n" - " tl.set_fontsize(9)\n" - "\n" - "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n" - "cb1 = plt.colorbar(im,ax=ax2)\n" - "cb1.set_label('Pairwise discordance')\n" - "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n" - "\n" - "ax1.tick_params(axis='both', which='major', labelsize=9)\n" - "ax1.tick_params(axis='both', which='minor', labelsize=9)\n" - "ax2.tick_params(axis='both', which='major', labelsize=9)\n" - "ax2.tick_params(axis='both', which='minor', labelsize=9)\n" - "\n" - "ax1.set_title('Sample Discordance Score')\n" - "ax2.set_ylabel('Sample ID')\n" - "ax2.set_xlabel('Sample ID')\n" - "ax3.set_ylabel('Average Depth',color='r')\n" - "ax1.set_xlabel('Sample ID')\n" - "ax1.set_ylabel('Average discordance')\n" - "\n" - "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n" - "plt.savefig('%s.png')\n" - "plt.close()\n" - "\n", args->plot,args->plot - ); - fclose(fp); - py_plot(fname); - free(fname); -} -#endif - -static void init_data(args_t *args) -{ - args->sm_hdr = args->files->readers[0].header; - if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname); - - if ( !args->cross_check ) - { - args->gt_hdr = args->files->readers[1].header; - int nsamples = bcf_hdr_nsamples(args->gt_hdr); - if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname); - args->lks = (double*) 
calloc(nsamples,sizeof(double)); - args->cnts = (int*) calloc(nsamples,sizeof(int)); - args->sites = (double*) calloc(nsamples,sizeof(double)); - args->dps = (int*) calloc(nsamples,sizeof(int)); - } + int iqry, igt; } +pair_t; -static void destroy_data(args_t *args) -{ - free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites); -} - -static int allele_to_int(bcf1_t *line, char *allele) +typedef struct { - int i; - for (i=0; in_allele; i++) - if ( !strcmp(allele,line->d.allele[i]) ) return i; - if ( strcmp(line->d.allele[i-1],"X") ) return -1; - return i-1; -} + bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check + bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF + char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples; + int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; + int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; + double *pdiff, *qry_prob, *gt_prob; + uint32_t *ndiff,*ncnt,ncmp, npairs; + int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr; + uint8_t *qry_dsg, *gt_dsg; + pair_t *pairs; + double *hwe_prob, dsg2prob[8][3], pl2prob[256]; + double min_inter_err, max_intra_err; + int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs; + FILE *fp; + unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL; -static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl) -{ - int i, j; - for (i=0; in_allele; i++) - { - // find which of the sm_alleles (k) corresponds to the gt_allele (i) - int k = allele_to_int(sm_line, gt_line->d.allele[i]); - if ( k<0 ) return 0; - for (j=0; j<=i; j++) - { - int l = allele_to_int(sm_line, gt_line->d.allele[j]); - if ( l<0 ) return 0; - gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? 
bcf_ij2G(k,l) : bcf_ij2G(l,k); - } - } - //for (i=0; icwd); } -static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line) +static int cmp_int(const void *_a, const void *_b) { - // PLs not present, use GTs instead. - int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs - int nsm_gt, i; - if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) - error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); - nsm_gt /= bcf_hdr_nsamples(hdr); - int npl = line->n_allele*(line->n_allele+1)/2; - hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr); - for (i=0; itmp_arr + i*nsm_gt; - int j, *pl_ptr = args->pl_arr + i*npl; - if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype - { - for (j=0; j b ) return 1; + return 0; +} +static int cmp_pair(const void *_a, const void *_b) +{ + pair_t *a = (pair_t*)_a; + pair_t *b = (pair_t*)_b; + if ( a->iqry < b->iqry ) return -1; + if ( a->iqry > b->iqry ) return 1; + if ( a->igt < b->igt ) return -1; + if ( a->igt > b->igt ) return 1; + return 0; } -static int cmp_doubleptr(const void *_a, const void *_b) +typedef struct +{ + uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms + unsigned long kbs_dat[1]; +} +diff_sites_t; +#if DBG +static void diff_sites_debug_print(args_t *args, diff_sites_t *ds) +{ + int i; + memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); + fprintf(bcftools_stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff); + for (i=0; inpairs; i++) fprintf(bcftools_stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0); + fprintf(bcftools_stderr,"\n"); +} +#endif +static int diff_sites_cmp(const void *aptr, const void *bptr) +{ + diff_sites_t *a = *((diff_sites_t**)aptr); + diff_sites_t *b = *((diff_sites_t**)bptr); + if ( a->ndiff < b->ndiff ) return 1; 
// descending order + if ( a->ndiff > b->ndiff ) return -1; + if ( a->rand < b->rand ) return -1; + if ( a->rand > b->rand ) return 1; + return 0; +} +static void diff_sites_init(args_t *args) +{ + int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites; + if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites); + if ( nsites > args->npairs ) + { + fprintf(bcftools_stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n"); + nsites = args->npairs; + args->distinctive_sites = args->npairs + 1; + } + else + args->distinctive_sites = nsites; + args->kbs_diff = kbs_init(args->npairs); + size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS; + assert( n==args->kbs_diff->n ); + args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long); + args->es = extsort_alloc(); + extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size); + extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix); + extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem); + extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp); + extsort_init(args->es); +} +static void diff_sites_destroy(args_t *args) { - double *a = *((double**)_a); - double *b = *((double**)_b); - if ( *a < *b ) return -1; - else if ( *a == *b ) return 0; + kbs_destroy(args->kbs_diff); + extsort_destroy(args->es); +} +static inline void diff_sites_reset(args_t *args) +{ + kbs_clear(args->kbs_diff); +} +static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos) +{ + diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size); + memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added) + dat->ndiff = ndiff; + dat->rid = rid; + dat->pos = pos; + dat->rand = hts_lrand48(); + 
memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long)); + extsort_push(args->es,dat); +} +static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos) +{ + diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es); + if ( !dat ) return 0; + *ndiff = dat->ndiff; + *rid = dat->rid; + *pos = dat->pos; + memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); return 1; } -static void check_gt(args_t *args) +static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname) { - int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0; - int fake_pls = args->no_PLs; + int i; + if ( !strcmp(list,"-") ) + { + *nsmpl = bcf_hdr_nsamples(hdr); + *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl)); + for (i=0; i<*nsmpl; i++) (*smpl)[i] = i; + return; + } - // Initialize things: check which tags are defined in the header, sample names etc. - if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname); - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) + char **tmp = hts_readlist(list, list_is_file, nsmpl); + if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list); + *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl)); + for (i=0; i<*nsmpl; i++) { - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) - error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); - if ( !args->no_PLs ) - fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); - fake_pls = 1; + int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]); + if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]); + (*smpl)[i] = idx; + free(tmp[i]); } + free(tmp); + qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int); + // check for duplicates + for (i=1; i<*nsmpl; i++) + if ( 
(*smpl)[i-1]==(*smpl)[i] ) + error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list); +} - FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : bcftools_stdout; - print_header(args, fp); +static void init_data(args_t *args) +{ + hts_srand48(0); - int tgt_isample = -1, query_isample = 0; - if ( args->target_sample ) + args->files = bcf_sr_init(); + if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); + if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); + + if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX); + if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum)); + if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) + error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); + + args->qry_hdr = bcf_sr_get_header(args->files,0); + if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname); + if ( args->gt_fname ) { - tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample); - if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample); + args->gt_hdr = bcf_sr_get_header(args->files,1); + if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname); } - if ( args->all_sites ) + + // Determine whether GT or PL will be used + if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default { - if ( tgt_isample==-1 ) - { - fprintf(bcftools_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); - tgt_isample = 0; - } + 
if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 ) + args->qry_use_GT = 0; + else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 ) + args->qry_use_GT = 1; + else + error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname); } - if ( args->query_sample ) + else if ( args->qry_use_GT==1 ) { - query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample); - if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample); + if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 ) + error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname); } - if ( args->all_sites ) - fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n", - args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]); + else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 ) + error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname); - // Main loop - float prev_lk = 0; - while ( (ret=bcf_sr_next_line(args->files)) ) + if ( args->gt_hdr ) { - if ( ret!=2 ) continue; - bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file - bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file - bcf_unpack(sm_line, BCF_UN_FMT); - bcf_unpack(gt_line, BCF_UN_FMT); - - // Init mapping from target genotype index to the sample's PL fields - int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2; - if ( n_gt2ipl > m_gt2ipl ) + if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default + { + if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 ) + args->gt_use_GT = 1; + else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 ) + args->gt_use_GT = 0; + else + error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname); + } + else if ( 
args->gt_use_GT==1 ) { - m_gt2ipl = n_gt2ipl; - gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl); + if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 ) + error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname); } - if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue; - - // Target genotypes - int ngt, npl; - if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) - error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - ngt /= bcf_hdr_nsamples(args->gt_hdr); - if ( ngt!=2 ) continue; // checking only diploid genotypes + else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 ) + error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname); + } + else + args->gt_use_GT = args->qry_use_GT; - // Sample PLs - if ( !fake_pls ) + // Prepare samples + int i,j; + args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr); + if ( args->qry_samples ) + { + init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); + } + if ( args->gt_samples ) + { + init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, + args->gt_hdr ? args->gt_hdr : args->qry_hdr, + args->gt_fname ? args->gt_fname : args->qry_fname); + } + else if ( args->pair_samples ) + { + int npairs; + char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs); + if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples); + if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n"); + args->npairs = args->pair_samples_is_file ? 
npairs : npairs/2; + args->pairs = (pair_t*) calloc(args->npairs,sizeof(*args->pairs)); + if ( !args->pair_samples_is_file ) { - if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 ) + for (i=0; inpairs; i++) { - if ( sm_line->n_allele==1 ) - { - // PL values may not be present when ALT=. (mpileup/bcftools output), in that case - // switch automatically to GT at these sites - npl = fake_PLs(args, args->sm_hdr, sm_line); - } - else - error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1); + args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]); + args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]); + if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]); + if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]); + free(tmp[2*i]); + free(tmp[2*i+1]); } - else - npl /= bcf_hdr_nsamples(args->sm_hdr); } else - npl = fake_PLs(args, args->sm_hdr, sm_line); + { + for (i=0; inpairs; i++) + { + char *ptr = tmp[i]; + while ( *ptr && !isspace(*ptr) ) ptr++; + if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]); + *ptr = 0; + args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]); + if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]); + ptr++; + while ( *ptr && isspace(*ptr) ) ptr++; + args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr); + if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr); + free(tmp[i]); + } + } + free(tmp); + qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair); + } + else if ( args->gt_hdr ) + args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr); + if ( !args->ngt_smpl ) + { + 
args->ngt_smpl = args->nqry_smpl; + args->gt_smpl = args->qry_smpl; + args->cross_check = 1; + } + + // The data arrays + if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl; + if ( !args->pair_samples ) + { + args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl); + args->gt_dsg = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl); + } + if ( args->use_PLs ) + { + args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff)); // log probability of pair samples being the same + args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob)); + args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob)); + + // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing + // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding + // probabilities of 0/0, 0/1, and 1/1 genotypes + for (i=0; i<8; i++) + for (j=0; j<3; j++) + args->dsg2prob[i][j] = HUGE_VAL; + args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs)); + args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs)); + args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs)); + args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs)); - // Calculate likelihoods for all samples, assuming diploid genotypes + // lookup table to avoid exponentiation + for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i); + } + else + args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff)); // number of differing genotypes for each pair of samples 
+ args->ncnt = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt)); // number of comparisons performed (non-missing data) + if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6); + if ( args->calc_hwe_prob ) + { + // prob of the observed sequence of matches given site AFs and HWE + args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob)); + if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6); + } + + if ( args->distinctive_sites ) diff_sites_init(args); + + args->fp = bcftools_stdout; + print_header(args, args->fp); +} + +static void destroy_data(args_t *args) +{ + if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg); + free(args->qry_dsg); + if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob); + free(args->qry_prob); + free(args->es_max_mem); + fclose(args->fp); + if ( args->distinctive_sites ) diff_sites_destroy(args); + free(args->hwe_prob); + free(args->cwd); + free(args->qry_arr); + if ( args->gt_hdr ) free(args->gt_arr); + free(args->pdiff); + free(args->ndiff); + free(args->ncnt); + free(args->qry_smpl); + if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl); + free(args->pairs); + bcf_sr_destroy(args->files); +} - // For faster access to genotype likelihoods (PLs) of the query sample - int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl; - double sum_pl = 0; // for converting PLs to probs - for (max_ipl=0; max_ipldsg2prob[dsg][0]; + prob[1] = args->dsg2prob[dsg][1]; + prob[2] = args->dsg2prob[dsg][2]; + } + return dsg; +} +static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob) +{ + uint8_t dsg = pl_to_dsg(ptr); + if ( dsg ) + { + prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255]; + prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255]; + prob[2] = (ptr[2]>=0 && ptr[2]<255) ? 
args->pl2prob[ptr[2]] : args->pl2prob[255]; + double sum = prob[0] + prob[1] + prob[2]; + prob[0] /= sum; + prob[1] /= sum; + prob[2] /= sum; + prob[0] = -log(prob[0]); + prob[1] = -log(prob[1]); + prob[2] = -log(prob[2]); + } + return dsg; +} +static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT) +{ + static int warn_dip_GT = 1; + static int warn_dip_PL = 1; + int i; + for (i=0; i<2; i++) + { + if ( *use_GT ) { - if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break; - if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue; - sum_pl += pow(10, -0.1*pl_ptr[max_ipl]); + int ret = bcf_get_genotypes(hdr,rec,arr,narr); + if ( ret < 0 ) + { + if ( !i ) { *use_GT = 0; continue; } + args->nskip_no_data++; + return -1; + } + if ( ret != 2*bcf_hdr_nsamples(hdr) ) + { + if ( warn_dip_GT ) + { + fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1); + warn_dip_GT = 0; + } + args->nskip_dip_GT++; + return -1; + } + *narr1 = 2; + return 0; } - if ( sum_pl==0 ) continue; // no PLs present - if ( fake_pls && args->no_PLs==1 ) sum_pl = -1; - // The main stats: concordance of the query sample with the target -g samples - for (i=0; igt_hdr); i++) + int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr); + if ( ret < 0 ) { - int *gt_ptr = gt_arr + i*ngt; - if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes - if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue; - int a = bcf_gt_allele(gt_ptr[0]); - int b = bcf_gt_allele(gt_ptr[1]); - if ( args->hom_only && a!=b ) continue; // heterozygous genotype - int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file - int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file - if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing - 
args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); - args->sites[i]++; + if ( !i ) { *use_GT = 1; continue; } + args->nskip_no_data++; + return -1; } - if ( args->all_sites ) + if ( ret != 3*bcf_hdr_nsamples(hdr) ) { - // Print LKs at all sites for debugging - int *gt_ptr = gt_arr + tgt_isample*ngt; - if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes - int a = bcf_gt_allele(gt_ptr[0]); - int b = bcf_gt_allele(gt_ptr[1]); - if ( args->hom_only && a!=b ) continue; // heterozygous genotype - fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); - fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : "."); - fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk); - prev_lk = args->lks[query_isample]; - - int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample - for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); - for (igt=0; igtpos+1); + warn_dip_PL = 0; + } + args->nskip_dip_PL++; + return -1; } + *narr1 = 3; + return 0; } - free(gt2ipl); - free(gt_arr); - free(args->pl_arr); - free(args->tmp_arr); + return -1; // should never reach +} +static void process_line(args_t *args) +{ + int i,j,k, nqry1, ngt1, ret; + + bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0); // the query file + int qry_use_GT = args->qry_use_GT; + int gt_use_GT = args->gt_use_GT; + + ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT); + if ( ret<0 ) return; - // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same - // plot as discordance per site, the latter must be scaled to the same range - int nsamples = bcf_hdr_nsamples(args->gt_hdr); - double extreme_lk = 0, extreme_lk_per_site = 0; - for (i=0; igt_hdr ) { - if 
( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i]; - if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i]; + gt_rec = bcf_sr_get_line(args->files,1); + ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, >_use_GT); + if ( ret<0 ) return; + } + else + { + ngt1 = nqry1; + args->gt_arr = args->qry_arr; } - // Sorted output - double **p = (double**) malloc(sizeof(double*)*nsamples); - for (i=0; ilks[i]; - qsort(p, nsamples, sizeof(int*), cmp_doubleptr); + args->ncmp++; - fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]); - for (i=0; icalc_hwe_prob ) { - int idx = p[i] - args->lks; - double per_site = 0; - if ( args->sites[idx] ) + int ac[2]; + if ( args->gt_hdr ) { - if ( args->sites[idx] && extreme_lk_per_site ) + if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n"); + } + else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n"); + + // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty + // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g. + // hwe[1,2,4] .. dsg=0,1,2 + // hwe[3] .. dsg=0 or 1 + // hwe[6] .. dsg=1 or 2 + + double hwe[3]; + const double min_af = 1e-5; // cap the AF in case we get unrealistic values + af = (double)ac[1]/(ac[0]+ac[1]); + hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af); + hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af)); + hwe[2] = af<(1-min_af) ? 
-log((1-af)*(1-af)) : -log(min_af*min_af); + hwe_dsg[0] = 0; + for (i=1; i<8; i++) + { + hwe_dsg[i] = HUGE_VAL; + for (k=0; k<3; k++) { - per_site = args->lks[idx]/args->sites[idx]; - per_site *= extreme_lk / extreme_lk_per_site; + if ( ((1< hwe[k] ) hwe_dsg[i] = hwe[k]; } - else - per_site = 0; } - fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i); } - if ( args->plot ) + // The sample pairs were given explicitly via -p/-P options + if ( args->pairs ) { - if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__); - plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); - } -} + if ( !args->use_PLs ) + { + int ndiff = 0; + if ( args->kbs_diff ) diff_sites_reset(args); -// static inline int is_hom_most_likely(int nals, int *pls) -// { -// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0]; -// for (ia=1; iasm_hdr, line, &args->tmp_arr, &args->ntmp_arr); + for (i=0; inpairs; i++) + { + int32_t *ptr; + uint8_t qry_dsg, gt_dsg; - if ( ngt<=0 ) return 1; // GT not present - if ( ngt!=args->nsmpl*2 ) return 2; // not diploid - ngt /= args->nsmpl; - - int i,j, idx = 0; - for (i=1; insmpl; i++) - { - int32_t *a = args->tmp_arr + i*ngt; - if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; } - int agt = 1<gt_arr + args->pairs[i].igt*ngt1; + gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr); + if ( !gt_dsg ) continue; // missing value + if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom + + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; + qry_dsg = qry_use_GT ? 
gt_to_dsg(ptr) : pl_to_dsg(ptr); + if ( !qry_dsg ) continue; // missing value + + int match = qry_dsg & gt_dsg; + if ( !match ) + { + args->ndiff[i]++; + if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); } + } + else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match]; + args->ncnt[i]++; + } - for (j=0; jrid, qry_rec->pos); + } + else // use_PLs set { - int32_t *b = args->tmp_arr + j*ngt; - if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; } - int bgt = 1<npairs; i++) + { + int32_t *ptr; + double qry_prob[3], gt_prob[3]; + uint8_t qry_dsg, gt_dsg; + + ptr = args->gt_arr + args->pairs[i].igt*ngt1; + gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob); + if ( !gt_dsg ) continue; // missing value + if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom + + ptr = args->qry_arr + args->pairs[i].iqry*nqry1; + qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob); + if ( !qry_dsg ) continue; // missing value - ntot[idx]++; - if ( agt!=bgt ) ndif[idx]++; - idx++; + double min = qry_prob[0] + gt_prob[0]; + qry_prob[1] += gt_prob[1]; + if ( min > qry_prob[1] ) min = qry_prob[1]; + qry_prob[2] += gt_prob[2]; + if ( min > qry_prob[2] ) min = qry_prob[2]; + args->pdiff[i] += min; + + if ( args->calc_hwe_prob ) + { + int match = qry_dsg & gt_dsg; + args->hwe_prob[i] += hwe_dsg[match]; + } + args->ncnt[i]++; + } } + return; } - return 0; -} -int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif) -{ - int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr); - if ( npl<=0 ) return 1; // PL not present - npl /= args->nsmpl; - - int i,j,k, idx = 0; - for (i=1; insmpl; i++) + int idx=0; + if ( !args->use_PLs ) { - int32_t *a = args->tmp_arr + i*npl; - int imin = -1; - for (k=0; knqry_smpl; i++) { - if ( a[k]==bcf_int32_vector_end ) break; - if ( a[k]==bcf_int32_missing ) continue; - if ( 
imin==-1 || a[imin] > a[k] ) imin = k; + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + int32_t *ptr = args->qry_arr + nqry1*iqry; + args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr); } - if ( imin<0 ) { idx+=i; continue; } - - for (j=0; jcross_check ) // in this case gt_dsg points to qry_dsg { - int32_t *b = args->tmp_arr + j*npl; - int jmin = -1; - for (k=0; kngt_smpl; i++) { - if ( b[k]==bcf_int32_vector_end ) break; - if ( b[k]==bcf_int32_missing ) continue; - if ( jmin==-1 || b[jmin] > b[k] ) jmin = k; + int igt = args->gt_smpl ? args->gt_smpl[i] : i; + int32_t *ptr = args->gt_arr + ngt1*igt; + args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr); + if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value + } + } + for (i=0; inqry_smpl; i++) + { + int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode? + if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value + for (j=0; jgt_dsg[j] ) { idx++; continue; } // missing value + int match = args->qry_dsg[i] & args->gt_dsg[j]; + if ( !match ) args->ndiff[idx]++; + else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match]; + args->ncnt[idx]++; + idx++; } - if ( jmin<0 ) { idx++; continue; } - - ntot[idx]++; - if ( imin!=jmin ) ndif[idx]++; - idx++; } } - return 0; -} + else // use_PLs set + { + for (i=0; inqry_smpl; i++) + { + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + int32_t *ptr = args->qry_arr + nqry1*iqry; + args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3); + } + if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg + { + for (i=0; ingt_smpl; i++) + { + int igt = args->gt_smpl ? args->gt_smpl[i] : i; + int32_t *ptr = args->gt_arr + ngt1*igt; + args->gt_dsg[i] = gt_use_GT ? 
gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3); + if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value + } + } + for (i=0; inqry_smpl; i++) + { + int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode? + if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value + for (j=0; jgt_dsg[j] ) { idx++; continue; } // missing value -static void cross_check_gts(args_t *args) -{ - // Initialize things: check which tags are defined in the header, sample names etc. - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) - { - if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) - error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); - if ( !args->no_PLs ) { - fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); - args->no_PLs = 99; + double min = args->qry_prob[i*3] + args->gt_prob[j*3]; + if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1]; + if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2]; + args->pdiff[idx] += min; + + if ( args->calc_hwe_prob ) + { + int match = args->qry_dsg[i] & args->gt_dsg[j]; + args->hwe_prob[idx] += hwe_dsg[match]; + } + args->ncnt[idx]++; + idx++; + } } } +} - args->nsmpl = bcf_hdr_nsamples(args->sm_hdr); - args->narr = (args->nsmpl-1)*args->nsmpl/2; - uint32_t *ndif = (uint32_t*) calloc(args->narr,4); - uint32_t *ntot = (uint32_t*) calloc(args->narr,4); +typedef struct +{ + int ism, idx; + double val; +} +idbl_t; +static int cmp_idbl(const void *_a, const void *_b) +{ + idbl_t *a = (idbl_t*)_a; + idbl_t *b = (idbl_t*)_b; + if ( a->val < b->val ) return -1; + if ( a->val > b->val ) return 1; + return 0; +} +static void report_distinctive_sites(args_t *args) +{ + 
extsort_sort(args->es); + + fprintf(args->fp,"# DS, distinctive sites:\n"); + fprintf(args->fp,"# - chromosome\n"); + fprintf(args->fp,"# - position\n"); + fprintf(args->fp,"# - cumulative number of pairs distinguished by this block\n"); + fprintf(args->fp,"# - block id\n"); + fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n"); - while ( bcf_sr_next_line(args->files) ) + kbitset_t *kbs_blk = kbs_init(args->npairs); + kbitset_iter_t itr; + int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0; + int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs; + while ( diff_sites_shift(args,&ndiff,&rid,&pos) ) { - bcf1_t *line = bcf_sr_get_line(args->files,0); - - // use PLs unless no_PLs is set and GT exists - if ( args->no_PLs ) + int ndiff_new = 0, ndiff_dbg = 0; + kbs_start(&itr); + while ( (i=kbs_next(args->kbs_diff, &itr))>=0 ) { - if ( process_GT(args,line,ntot,ndif)==0 ) continue; + ndiff_dbg++; + if ( kbs_exists(kbs_blk,i) ) continue; // already set + kbs_insert(kbs_blk,i); + ndiff_new++; } - process_PL(args,line,ntot,ndif); + if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff); + if ( !ndiff_new ) continue; // no new pair distinguished by this site + ndiff_tot += ndiff_new; + fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock); + if ( ndiff_tot < ndiff_min ) continue; // fewer than the requested number of pairs can be distinguished at this point + iblock++; + ndiff_tot = 0; + kbs_clear(kbs_blk); } - - FILE *fp = bcftools_stdout; - print_header(args, fp); + kbs_destroy(kbs_blk); +} +static void report(args_t *args) +{ + fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp); + fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match); + fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba); + 
fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono); + fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data); + fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT); + fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL); + fprintf(args->fp,"# DC, discordance:\n"); + fprintf(args->fp,"# - query sample\n"); + fprintf(args->fp,"# - genotyped sample\n"); + fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n"); + fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n"); + fprintf(args->fp,"# - number of sites compared (bigger is better)\n"); + fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n"); - float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2); + int trim = args->ntop; + if ( !args->pairs ) + { + if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0; + if ( args->ngt_smpl && args->ngt_smpl <= args->ntop ) trim = 0; + } - // Output pairwise distances - fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n"); - int i,j, idx = 0; - for (i=0; insmpl; i++) + if ( args->pairs ) { - for (j=0; jnpairs; i++) { - float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10; - fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); - PDIST(tmp,i,j) = err; - idx++; + int iqry = args->pairs[i].iqry; + int igt = args->pairs[i].igt; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->ndiff[i], + args->calc_hwe_prob ? 
args->hwe_prob[i] : 0, + args->ncnt[i]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->pdiff[i], + args->calc_hwe_prob ? args->hwe_prob[i] : 0, + args->ncnt[i]); + } } } - - // Cluster samples - int nlist; - float clust_max_err = args->max_intra_err; - hclust_t *clust = hclust_init(args->nsmpl,tmp); - cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist); - fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n"); - for (i=0; ism_hdr->samples[list[i].memb[j]]); - fprintf(fp,"\n"); - } - hclust_destroy_list(list,nlist); - // Debugging output: the cluster graph and data used for deciding - char **dbg = hclust_explain(clust,&nlist); - for (i=0; ism_hdr->samples,clust_max_err)); - hclust_destroy(clust); - free(tmp); - - - // Deprecated output for temporary backward compatibility - fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n"); - fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); - idx = 0; - for (i=0; insmpl; i++) + else if ( !trim ) { - for (j=0; jnqry_smpl; i++) { - fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); - idx++; + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + int ngt = args->cross_check ? i : args->ngt_smpl; + for (j=0; jgt_smpl ? args->gt_smpl[j] : j; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->ndiff[idx], + args->calc_hwe_prob ? 
args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->pdiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + idx++; + } } } - - free(ndif); - free(ntot); - free(args->tmp_arr); + else if ( !args->cross_check ) + { + idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl); + int i,j; + for (i=0; inqry_smpl; i++) + { + int idx = i*args->ngt_smpl; + for (j=0; jngt_smpl; j++) + { + if ( args->sort_by_hwe ) + arr[j].val = -args->hwe_prob[idx]; + else if ( args->ndiff ) + arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0; + else + arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0; + arr[j].ism = j; + arr[j].idx = idx; + idx++; + } + qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl); + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + for (j=0; jntop; j++) + { + int idx = arr[j].idx; + int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->ndiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt], + args->pdiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + } + } + free(arr); + } + else + { + int narr = args->nqry_smpl-1; + idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr); + int i,j,k,idx; + for (i=0; inqry_smpl; i++) + { + k = 0, idx = i*(i-1)/2; + for (j=0; jsort_by_hwe ) + arr[k].val = -args->hwe_prob[idx]; + else if ( args->ndiff ) + arr[k].val = args->ncnt[idx] ? 
(double)args->ndiff[idx]/args->ncnt[idx] : 0; + else + arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0; + arr[k].ism = j; + arr[k].idx = idx; + idx++; + k++; + } + for (; jsort_by_hwe ) + arr[k].val = -args->hwe_prob[idx]; + else if ( args->ndiff ) + arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0; + else + arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0; + arr[k].ism = j + 1; + arr[k].idx = idx; + k++; + } + qsort(arr, narr, sizeof(*arr), cmp_idbl); + int iqry = args->qry_smpl ? args->qry_smpl[i] : i; + for (j=0; jntop; j++) + { + if ( i <= arr[j].ism ) continue; + int idx = arr[j].idx; + int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism; + if ( args->ndiff ) + { + fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->qry_hdr->samples[igt], + args->ndiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + else + { + fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n", + args->qry_hdr->samples[iqry], + args->qry_hdr->samples[igt], + args->pdiff[idx], + args->calc_hwe_prob ? args->hwe_prob[idx] : 0, + args->ncnt[idx]); + } + } + } + free(arr); + } } -static char *init_prefix(char *prefix) +static int is_input_okay(args_t *args, int nmatch) { - int len = strlen(prefix); - if ( prefix[len-1] == '/' || prefix[len-1] == '\\' ) - return msprintf("%sgtcheck", prefix); - return strdup(prefix); + int i; + const char *msg; + bcf_hdr_t *hdr; + bcf1_t *rec; + if ( args->gt_hdr && nmatch!=2 ) + { + if ( args->nskip_no_match++ ) return 0; + for (i=0; i<2; i++) + { + rec = bcf_sr_get_line(args->files,i); + if ( rec ) break; + } + hdr = bcf_sr_get_header(args->files,i); + fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. 
(This is printed only once.)\n", + bcf_seqname(hdr,rec),rec->pos+1); + return 0; + } + for (i=0; i<2; i++) + { + hdr = bcf_sr_get_header(args->files,i); + rec = bcf_sr_get_line(args->files,i); + if ( rec->n_allele>2 ) + { + if ( args->nskip_not_ba++ ) return 0; + msg = "not a biallelic site, run `bcftools norm -m -` first"; + goto not_okay; + } + if ( bcf_get_variant_types(rec)==VCF_REF ) + { + if ( args->nskip_mono++ ) return 0; + msg = "monoallelic site"; + goto not_okay; + } + if ( !args->gt_hdr ) break; + } + return 1; + +not_okay: + fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", + bcf_seqname(hdr,rec),rec->pos+1,msg); + return 0; } static void usage(void) @@ -714,30 +1028,62 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools gtcheck [options] [-g ] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -a, --all-sites output comparison for all sites\n"); - fprintf(bcftools_stderr, " -c, --cluster min inter- and max intra-sample error [0.23,-0.3]\n"); - fprintf(bcftools_stderr, " -g, --genotypes genotypes to compare against\n"); - fprintf(bcftools_stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); - fprintf(bcftools_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); - fprintf(bcftools_stderr, " -p, --plot plot\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --query-sample query sample (by default the first sample is checked)\n"); - fprintf(bcftools_stderr, " -S, --target-sample target sample in the -g file (used only for plotting)\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams 
rather than index-jumps\n"); + //fprintf(bcftools_stderr, " -a, --all-sites Output comparison for all sites\n"); + //fprintf(bcftools_stderr, " -c, --cluster MIN,MAX Min inter- and max intra-sample error [0.23,-0.3]\n"); + fprintf(bcftools_stderr, " --distinctive-sites Find sites that can distinguish between at least NUM sample pairs.\n"); + fprintf(bcftools_stderr, " NUM[,MEM[,TMP]] If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n"); + fprintf(bcftools_stderr, " The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n"); +#ifdef _WIN32 + fprintf(bcftools_stderr, " and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n"); +#else + fprintf(bcftools_stderr, " and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n"); +#endif + fprintf(bcftools_stderr, " --dry-run Stop after first record to estimate required time\n"); + fprintf(bcftools_stderr, " -e, --error-probability INT Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n"); + fprintf(bcftools_stderr, " -g, --genotypes FILE Genotypes to compare against\n"); + fprintf(bcftools_stderr, " -H, --homs-only Homozygous genotypes only, useful with low coverage data (requires -g)\n"); + fprintf(bcftools_stderr, " --n-matches INT Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n"); + fprintf(bcftools_stderr, " Use negative value to sort by HWE probability rather than by discordance [0]\n"); + fprintf(bcftools_stderr, " --no-HWE-prob Disable calculation of HWE probability\n"); + fprintf(bcftools_stderr, " -p, --pairs LIST Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] 
w/o)\n"); + fprintf(bcftools_stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n"); + fprintf(bcftools_stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n"); + fprintf(bcftools_stderr, "Examples:\n"); + fprintf(bcftools_stderr, " # Check discordance of all samples from B against all sample in A\n"); + fprintf(bcftools_stderr, " bcftools gtcheck -g A.bcf B.bcf\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, " # Limit comparisons to the fiven list of samples\n"); + fprintf(bcftools_stderr, " bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + fprintf(bcftools_stderr, " # Compare only two pairs a1,b1 and a1,b2\n"); + fprintf(bcftools_stderr, " bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n"); + fprintf(bcftools_stderr, "\n"); + bcftools_exit(1); } int main_vcfgtcheck(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); - args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; set_cwd(args); - char *regions = NULL, *targets = NULL; - int regions_is_file = 0, targets_is_file = 0; + args->qry_use_GT = -1; + args->gt_use_GT = -1; + args->calc_hwe_prob = 1; + args->use_PLs = 
40; + + // external sort for --distinctive-sites +#ifdef _WIN32 + args->es_tmp_prefix = NULL; +#else + args->es_tmp_prefix = "/tmp/bcftools-gtcheck"; +#endif + args->es_max_mem = strdup("500M"); // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 // - min_inter: pairs with smaller err value will be considered identical @@ -748,6 +1094,8 @@ int main_vcfgtcheck(int argc, char *argv[]) static struct option loptions[] = { + {"error-probability",1,0,'e'}, + {"use",1,0,'u'}, {"cluster",1,0,'c'}, {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, @@ -755,18 +1103,74 @@ int main_vcfgtcheck(int argc, char *argv[]) {"help",0,0,'h'}, {"genotypes",1,0,'g'}, {"plot",1,0,'p'}, - {"target-sample",1,0,'S'}, - {"query-sample",1,0,'s'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, + {"n-matches",1,0,2}, + {"no-HWE-prob",0,0,3}, + {"target-sample",1,0,4}, + {"dry-run",0,0,5}, + {"distinctive-sites",1,0,6}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"pairs",1,0,'p'}, + {"pairs-file",1,0,'P'}, {0,0,0,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) { switch (c) { + case 'e': + args->use_PLs = strtol(optarg,&tmp,10); + if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg); + break; + case 'u': + { + int i,nlist; + char **list = hts_readlist(optarg, 0, &nlist); + if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg); + if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1; + else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0; + else error("Failed to parse --use %s; only GT and PL are supported\n", optarg); + if ( nlist==2 ) + { + if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1; + else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0; + else error("Failed to parse --use %s; only 
GT and PL are supported\n", optarg); + } + else args->gt_use_GT = args->qry_use_GT; + for (i=0; intop = strtol(optarg,&tmp,10); + if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg); + if ( args->ntop < 0 ) + { + args->sort_by_hwe = 1; + args->ntop *= -1; + } + break; + case 3 : args->calc_hwe_prob = 0; break; + case 4 : error("The option -S, --target-sample has been deprecated\n"); break; + case 5 : args->dry_run = 1; break; + case 6 : + args->distinctive_sites = strtod(optarg,&tmp); + if ( *tmp ) + { + if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg); + tmp++; + free(args->es_max_mem); + args->es_max_mem = strdup(tmp); + while ( *tmp && *tmp!=',' ) tmp++; + if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; } + } + args->use_PLs = 0; + break; case 'c': + error("The -c option is to be implemented, please open an issue on github\n"); args->min_inter_err = strtod(optarg,&tmp); if ( *tmp ) { @@ -775,50 +1179,77 @@ int main_vcfgtcheck(int argc, char *argv[]) if ( *tmp ) error("Could not parse: -c %s\n", optarg); } break; - case 'G': - args->no_PLs = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg); - break; - case 'a': args->all_sites = 1; break; + case 'G': error("The option -G, --GTs-only has been deprecated\n"); break; + case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break; case 'H': args->hom_only = 1; break; case 'g': args->gt_fname = optarg; break; - case 'p': args->plot = optarg; break; - case 'S': args->target_sample = optarg; break; - case 's': args->query_sample = optarg; break; - case 'r': regions = optarg; break; - case 'R': regions = optarg; regions_is_file = 1; break; - case 't': targets = optarg; break; - case 'T': targets = optarg; targets_is_file = 1; break; +// case 'p': args->plot = optarg; break; + case 's': + if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3; + else if ( 
!strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4; + else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); + break; + case 'S': + if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1; + else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1; + else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg); + break; + case 'p': args->pair_samples = optarg; break; + case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break; + case 'r': args->regions = optarg; break; + case 'R': args->regions = optarg; args->regions_is_file = 1; break; + case 't': args->targets = optarg; break; + case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } } - char *fname = NULL; if ( optind==argc ) { - if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin + if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-"; // reading from stdin else usage(); // no files given } - else fname = argv[optind]; - if ( argc>optind+1 ) usage(); // too many files given - if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode - else args->files->require_index = 1; - if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); - if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); - if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); - if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) - error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); - 
args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; - if ( args->plot ) args->plot = init_prefix(args->plot); + else args->qry_fname = argv[optind]; + if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n"); // too many files given + if ( args->pair_samples ) + { + if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n"); + if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n"); + } + if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n"); + if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n"); + if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n"); + init_data(args); - if ( args->cross_check ) - cross_check_gts(args); - else - check_gt(args); + + int ret; + while ( (ret=bcf_sr_next_line(args->files)) ) + { + if ( !is_input_okay(args,ret) ) continue; + + // time one record to give the user an estimate with very big files + struct timeval t0, t1; + if ( !args->ncmp ) gettimeofday(&t0, NULL); + + process_line(args); + + if ( args->ncmp==1 ) + { + gettimeofday(&t1, NULL); + double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); + fprintf(bcftools_stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6); + fprintf(args->fp,"INFO\tTime required to process one record .. 
%f seconds\n",delta/1e6); + if ( args->dry_run ) break; + } + } + if ( !args->dry_run ) + { + report(args); + if ( args->distinctive_sites ) report_distinctive_sites(args); + } + destroy_data(args); - bcf_sr_destroy(args->files); - if (args->plot) free(args->plot); free(args); return 0; } diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index 9f7de23..4a16d8a 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -1,6 +1,6 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Shane McCarthy @@ -24,6 +24,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include #include #include @@ -37,6 +38,11 @@ DEALINGS IN THE SOFTWARE. */ #define BCF_LIDX_SHIFT 14 +enum { + per_contig = 1, + total = 2 +}; + static void usage(void) { fprintf(stderr, "\n"); @@ -47,7 +53,7 @@ static void usage(void) fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); fprintf(stderr, " -f, --force overwrite index if it already exists\n"); fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(stderr, " -o, --output-file FILE optional output index file name\n"); + fprintf(stderr, " -o, --output FILE optional output index file name\n"); fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n"); fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); @@ -60,65 +66,137 @@ static void usage(void) int vcf_index_stats(char *fname, int stats) { - const char **seq; - int i, nseq; + const char **seq = NULL; + int tid, nseq = 0, ret = 0; tbx_t *tbx = NULL; + bcf_hdr_t *hdr = NULL; hts_idx_t *idx = NULL; + htsFile *fp = NULL; + uint64_t sum = 0; + char *fntemp = NULL, *fnidx = NULL; - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; } - bcf_hdr_t *hdr = 
bcf_hdr_read(fp); - if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; } - - if ( hts_get_format(fp)->format==vcf ) + /* + * First, has the user provided an index file? If per contig stats + * are requested, open the variant file (together with the index file, + * if provided), since the contig names can only be retrieved from its + * header. Otherwise, use just the corresponding index file to count + * the total number of records. + */ + int len = strlen(fname); + if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) { + fntemp = strdup(fname); + if ( !fntemp ) return 1; + fntemp[fnidx-fname] = 0; + fname = fntemp; + fnidx += strlen(HTS_IDX_DELIM); + } + else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) ) { - tbx = tbx_index_load(fname); - if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; } + fnidx = fname; + fntemp = strdup(fname); + fname = fntemp; + fname[len-4] = 0; } - else if ( hts_get_format(fp)->format==bcf ) + + if ( stats&per_contig ) { - idx = bcf_index_load(fname); - if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; } + fp = hts_open(fname,"r"); + if ( !fp ) { + fprintf(stderr,"Could not read %s\n", fname); + ret = 1; goto cleanup; + } + hdr = bcf_hdr_read(fp); + if ( !hdr ) { + fprintf(stderr,"Could not read the header: %s\n", fname); + ret = 1; goto cleanup; + } + + if ( hts_get_format(fp)->format==vcf ) + { + tbx = tbx_index_load2(fname, fnidx); + if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; } + } + else if ( hts_get_format(fp)->format==bcf ) + { + idx = bcf_index_load2(fname, fnidx); + if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; } + } + else + { + fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); + return 1; + } } - else + else if ( fnidx ) { - fprintf(stderr,"Could not detect the file type as 
VCF or BCF: %s\n", fname); - return 1; + char *ext = strrchr(fnidx, '.'); + if ( ext && strcmp(ext, ".tbi") == 0 ) { + tbx = tbx_index_load2(fname, fnidx); + } else if ( ext && strcmp(ext, ".csi") == 0 ) { + idx = bcf_index_load2(fname, fnidx); + } + if ( !tbx && !idx ) { + fprintf(stderr,"Could not load index file '%s'\n", fnidx); + ret = 1; goto cleanup; + } + } else { + char *ext = strrchr(fname, '.'); + if ( ext && strcmp(ext, ".bcf") == 0 ) { + idx = bcf_index_load(fname); + } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) { + tbx = tbx_index_load(fname); + } } - seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); - uint64_t sum = 0; - for (i=0; iidx : idx, i, &records, &v); - sum+=records; - if (stats&2 || !records) continue; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); - int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; - printf("%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); - } - if (!sum) + hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v); + sum += records; + if ( (stats&total) || !records ) continue; + const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL; + if ( ctg_name ) { + bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; + int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; + printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); + } + } + if ( !sum ) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); - if (bcf_read1(fp, hdr, rec) >= 0) - { + if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) { fprintf(stderr,"index of %s does not contain any count metadata. 
Please re-index with a newer version of bcftools or tabix.\n", fname); - return 1; + ret = 1; } bcf_destroy1(rec); } - if (stats&2) printf("%" PRIu64 "\n", sum); + if ( (stats&total) && !ret ) { + printf("%" PRIu64 "\n", sum); + } + +cleanup: free(seq); - if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); + free(fntemp); + if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); - return 0; + return ret; } int main_vcfindex(int argc, char *argv[]) @@ -137,6 +215,7 @@ int main_vcfindex(int argc, char *argv[]) {"nrecords",no_argument,NULL,'n'}, {"threads",required_argument,NULL,9}, {"output-file",required_argument,NULL,'o'}, + {"output",required_argument,NULL,'o'}, {NULL, 0, NULL, 0} }; @@ -152,8 +231,8 @@ int main_vcfindex(int argc, char *argv[]) min_shift = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg); break; - case 's': stats |= 1; break; - case 'n': stats |= 2; break; + case 's': stats |= per_contig; break; + case 'n': stats |= total; break; case 9: n_threads = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); @@ -162,7 +241,7 @@ int main_vcfindex(int argc, char *argv[]) default: usage(); } } - if (stats>2) + if (stats > total) { fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); return 1; diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index 0b7aeeb..acbae89 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -2,7 +2,7 @@ /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Shane McCarthy @@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include #include #include @@ -39,6 +40,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #define BCF_LIDX_SHIFT 14 +enum { + per_contig = 1, + total = 2 +}; + static void usage(void) { fprintf(bcftools_stderr, "\n"); @@ -49,7 +55,7 @@ static void usage(void) fprintf(bcftools_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); fprintf(bcftools_stderr, " -f, --force overwrite index if it already exists\n"); fprintf(bcftools_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(bcftools_stderr, " -o, --output-file FILE optional output index file name\n"); + fprintf(bcftools_stderr, " -o, --output FILE optional output index file name\n"); fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); @@ -57,70 +63,142 @@ static void usage(void) fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n"); fprintf(bcftools_stderr, " -s, --stats print per contig stats based on existing index file\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int vcf_index_stats(char *fname, int stats) { - const char **seq; - int i, nseq; + const char **seq = NULL; + int tid, nseq = 0, ret = 0; tbx_t *tbx = NULL; + bcf_hdr_t *hdr = NULL; hts_idx_t *idx = NULL; + htsFile *fp = NULL; + uint64_t sum = 0; + char *fntemp = NULL, *fnidx = NULL; - htsFile *fp = hts_open(fname,"r"); - if ( !fp ) { fprintf(bcftools_stderr,"Could not read %s\n", fname); return 1; } - bcf_hdr_t *hdr = bcf_hdr_read(fp); - if ( !hdr ) { fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); return 1; } - - if ( hts_get_format(fp)->format==vcf ) + /* + * First, has the user provided an index file? If per contig stats + * are requested, open the variant file (together with the index file, + * if provided), since the contig names can only be retrieved from its + * header. 
Otherwise, use just the corresponding index file to count + * the total number of records. + */ + int len = strlen(fname); + if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) { + fntemp = strdup(fname); + if ( !fntemp ) return 1; + fntemp[fnidx-fname] = 0; + fname = fntemp; + fnidx += strlen(HTS_IDX_DELIM); + } + else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) ) { - tbx = tbx_index_load(fname); - if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; } + fnidx = fname; + fntemp = strdup(fname); + fname = fntemp; + fname[len-4] = 0; } - else if ( hts_get_format(fp)->format==bcf ) + + if ( stats&per_contig ) { - idx = bcf_index_load(fname); - if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; } + fp = hts_open(fname,"r"); + if ( !fp ) { + fprintf(bcftools_stderr,"Could not read %s\n", fname); + ret = 1; goto cleanup; + } + hdr = bcf_hdr_read(fp); + if ( !hdr ) { + fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); + ret = 1; goto cleanup; + } + + if ( hts_get_format(fp)->format==vcf ) + { + tbx = tbx_index_load2(fname, fnidx); + if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; } + } + else if ( hts_get_format(fp)->format==bcf ) + { + idx = bcf_index_load2(fname, fnidx); + if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; } + } + else + { + fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); + return 1; + } } - else + else if ( fnidx ) { - fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); - return 1; + char *ext = strrchr(fnidx, '.'); + if ( ext && strcmp(ext, ".tbi") == 0 ) { + tbx = tbx_index_load2(fname, fnidx); + } else if ( ext && strcmp(ext, ".csi") == 0 ) { + idx = bcf_index_load2(fname, fnidx); + } + if ( !tbx && !idx ) { + fprintf(bcftools_stderr,"Could 
not load index file '%s'\n", fnidx); + ret = 1; goto cleanup; + } + } else { + char *ext = strrchr(fname, '.'); + if ( ext && strcmp(ext, ".bcf") == 0 ) { + idx = bcf_index_load(fname); + } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) { + tbx = tbx_index_load(fname); + } } - seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); - uint64_t sum = 0; - for (i=0; iidx : idx, i, &records, &v); - sum+=records; - if (stats&2 || !records) continue; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); - int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; - fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); - } - if (!sum) + hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v); + sum += records; + if ( (stats&total) || !records ) continue; + const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL; + if ( ctg_name ) { + bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; + int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; + fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); + } + } + if ( !sum ) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); - if (bcf_read1(fp, hdr, rec) >= 0) - { + if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) { fprintf(bcftools_stderr,"index of %s does not contain any count metadata. 
Please re-index with a newer version of bcftools or tabix.\n", fname); - return 1; + ret = 1; } bcf_destroy1(rec); } - if (stats&2) fprintf(bcftools_stdout, "%" PRIu64 "\n", sum); + if ( (stats&total) && !ret ) { + fprintf(bcftools_stdout, "%" PRIu64 "\n", sum); + } + +cleanup: free(seq); - if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); + free(fntemp); + if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); - return 0; + return ret; } int main_vcfindex(int argc, char *argv[]) @@ -139,6 +217,7 @@ int main_vcfindex(int argc, char *argv[]) {"nrecords",no_argument,NULL,'n'}, {"threads",required_argument,NULL,9}, {"output-file",required_argument,NULL,'o'}, + {"output",required_argument,NULL,'o'}, {NULL, 0, NULL, 0} }; @@ -154,8 +233,8 @@ int main_vcfindex(int argc, char *argv[]) min_shift = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg); break; - case 's': stats |= 1; break; - case 'n': stats |= 2; break; + case 's': stats |= per_contig; break; + case 'n': stats |= total; break; case 9: n_threads = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); @@ -164,7 +243,7 @@ int main_vcfindex(int argc, char *argv[]) default: usage(); } } - if (stats>2) + if (stats > total) { fprintf(bcftools_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); return 1; diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index 261841c..1d2fab1 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -141,7 +141,7 @@ void isec_vcf(args_t *args) if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { - out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); @@ -356,7 +356,7 @@ static void init_data(args_t *args) #define OPEN_FILE(i,j) { \ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \ - args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \ + args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ @@ -397,10 +397,9 @@ static void init_data(args_t *args) fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname); } #undef OPEN_FILE - - args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix); - if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno)); } + args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix); + if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno)); } else { if (args->output_fname) { diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index 2ef8853..d59d7df 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ 
-2,7 +2,7 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -143,7 +143,7 @@ void isec_vcf(args_t *args) if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { - out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); @@ -358,7 +358,7 @@ static void init_data(args_t *args) #define OPEN_FILE(i,j) { \ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \ - args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \ + args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ @@ -399,10 +399,9 @@ static void init_data(args_t *args) fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname); } #undef OPEN_FILE - - args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix); - if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno)); } + args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix); + if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno)); } else { if 
(args->output_fname) { @@ -494,7 +493,7 @@ static void usage(void) fprintf(bcftools_stderr, " # Extract records private to A or B comparing by position only\n"); fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfisec(int argc, char *argv[]) diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 42c2bd3..637e1b9 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ typedef khash_t(strdict) strdict_t; #define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; } +#define PL2PROB_MAX 1024 + // For merging INFO Number=A,G,R tags typedef struct { @@ -132,6 +135,11 @@ typedef struct gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line int nout_smpl; kstring_t *str; + int32_t *laa; // localized alternate alleles given as input-based indexes in per-sample blocks of (args->local_alleles+1) values, 0 is always first + int nlaa, laa_dirty; // number of LAA alleles actually used at this site, and was any L* added? 
+ int32_t *tmpi, *k2k; + double *tmpd, *pl2prob; // mapping from phred-score likelihoods (PL) to probability + int ntmpi, ntmpd, nk2k; } maux_t; @@ -141,7 +149,7 @@ typedef struct maux_t *maux; regidx_t *regs; // apply regions only after the blocks are expanded regitr_t *regs_itr; - int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref; + int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index; char *header_fname, *output_fname, *regions_list, *info_rules, *file_list; faidx_t *gvcf_fai; info_rule_t *rules; @@ -154,6 +162,7 @@ typedef struct bcf_hdr_t *out_hdr; char **argv; int argc, n_threads, record_cmd_line; + int local_alleles; // the value of -L option } args_t; @@ -262,7 +271,28 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals); } else + { + int isrc, idst = 0; + #define BRANCH(type_t,is_missing,is_vector_end) { \ + type_t *ptr = (type_t*) rule->vals; \ + for (isrc=0; isrcnvals; isrc++) \ + { \ + if ( is_vector_end ) break; \ + if ( is_missing ) continue; \ + if ( idst!=isrc ) ptr[idst] = ptr[isrc]; \ + idst++; \ + } \ + } + switch (rule->type) { + case BCF_HT_INT: BRANCH(int32_t, ptr[isrc]==bcf_int32_missing, ptr[isrc]==bcf_int32_vector_end); break; + case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[isrc]), bcf_float_is_vector_end(ptr[isrc])); break; + default: error("TODO: %s:%d .. 
type=%d\n", __FILE__,__LINE__, rule->type); + } + #undef BRANCH + + rule->nvals = idst; bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type); + } } static int info_rules_comp_key2(const void *a, const void *b) @@ -344,7 +374,7 @@ static void info_rules_init(args_t *args) if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); - else error("The type is not supported: \"%s\"\n", rule->hdr_tag); + else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); ss = strchr(ss, '\0'); ss++; if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag); @@ -366,8 +396,17 @@ static void info_rules_init(args_t *args) bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G || bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R ) ? 1 : 0; - if ( is_join && is_agr ) - error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag); + if ( is_join && bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)!=BCF_VL_VAR ) + { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->out_hdr, BCF_HL_INFO, "ID", rule->hdr_tag, NULL); + hrec = bcf_hrec_dup(hrec); + int i = bcf_hrec_find_key(hrec, "Number"); + if ( i<0 ) error("Uh, could not find the entry Number in the header record of %s\n",rule->hdr_tag); + free(hrec->vals[i]); + hrec->vals[i] = strdup("."); + bcf_hdr_remove(args->out_hdr,BCF_HL_INFO, rule->hdr_tag); + bcf_hdr_add_hrec(args->out_hdr, hrec); + } if ( !is_join && !is_agr ) error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag); } @@ -689,7 +728,7 @@ maux_t *maux_init(args_t *args) assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) ); if ( args->do_gvcf ) { - ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); + ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // 
-Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n for (i=0; in; i++) ma->gvcf[i].line = bcf_init1(); } @@ -699,6 +738,13 @@ maux_t *maux_init(args_t *args) for (i=0; in; i++) ma->buf[i].rid = -1; ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); + if ( args->local_alleles ) + { + ma->laa = (int32_t*)malloc(sizeof(*ma->laa)*ma->nout_smpl*(1+args->local_alleles)); + ma->pl2prob = (double*)malloc(PL2PROB_MAX*sizeof(*ma->pl2prob)); + for (i=0; ipl2prob[i] = pow(10,-0.1*i); + } return ma; } void maux_destroy(maux_t *ma) @@ -737,6 +783,11 @@ void maux_destroy(maux_t *ma) free(ma->smpl_ploidy); free(ma->smpl_nGsize); free(ma->chr); + free(ma->laa); + free(ma->tmpi); + free(ma->k2k); + free(ma->tmpd); + free(ma->pl2prob); free(ma); } void maux_expand1(buffer_t *buf, int size) @@ -1325,6 +1376,171 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl) return max_ploidy; } +// Sets ma->laa to local indexes relevant for each sample or missing/vector_end. +// The indexes are with respect to the source indexes and must be translated as +// the very last step. +void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL) +{ + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + int i,j,k,l, ismpl = 0, nlaa = 0; + static int warned = 0; + + hts_expand(double,out->n_allele,ma->ntmpd,ma->tmpd); // allele probabilities + hts_expand(int,out->n_allele,ma->ntmpi,ma->tmpi); // indexes of the sorted probabilities + + // Let map[] be the mapping from src to output idx. Then k2k[] is mapping from src allele idxs to src allele idxs + // reordered so that if in_allele,ma->nk2k,ma->k2k); + + // Determine local alleles: either take all that are present in the reader or use PL to determine the best + // subset for each sample. The alleles must be listed in the order of the alleles in the output file. 
+ for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + bcf_fmt_t *fmt_ori = ma->fmt_map[files->nreaders*ifmt_PL+i]; + bcf1_t *line = maux_get_line(args, i); + int nsmpl = bcf_hdr_nsamples(hdr); + if ( line ) + { + if ( nlaa < line->n_allele - 1 ) + nlaa = line->n_allele - 1 <= args->local_alleles ? line->n_allele - 1 : args->local_alleles; + + for (j=0; jn_allele; j++) ma->k2k[j] = j; + + if ( line->n_allele <= args->local_alleles + 1 ) + { + // sort to the output order, insertion sort, ascending + int *map = ma->buf[i].rec[ma->buf[i].cur].map; + int *k2k = ma->k2k; + int tmp; + for (k=1; kn_allele; k++) + for (l=k; l>0 && map[k2k[l]] < map[k2k[l-1]]; l--) + tmp = k2k[l], k2k[l] = k2k[l-1], k2k[l-1] = tmp; + + // fewer than the allowed number of alleles, use all alleles from this file + for (j=0; jlaa + (1+args->local_alleles)*ismpl; + for (k=0; kn_allele; k++) ptr[k] = k2k[k]; + for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; + ismpl++; + } + continue; + } + } + if ( !line || !fmt_ori ) + { + // no values, fill in missing values + for (j=0; jlaa + (1+args->local_alleles)*ismpl; + ptr[0] = bcf_int32_missing; + for (k=1; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; + ismpl++; + } + continue; + } + + // there are more alternate alleles in the input files than is allowed on output, need to subset + if ( ifmt_PL==-1 ) + { + if ( !warned ) + fprintf(stderr,"Warning: local alleles are determined from FORMAT/PL but the tag is missing, cannot apply --local-alleles\n"); + warned = 1; + ma->nlaa = 0; + return; + } + + if ( !IS_VL_G(hdr, fmt_ori->id) ) error("FORMAT/PL must be defined as Number=G\n"); + if ( 2*fmt_ori->n != line->n_allele*(line->n_allele+1) ) error("Todo: haploid PL to LPL\n"); + + int *map = ma->buf[i].rec[ma->buf[i].cur].map; + double *allele_prob = ma->tmpd; + int *idx = ma->tmpi; + #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \ + 
src_type_t *src = (src_type_t*) fmt_ori->p; \ + for (j=0; jn_allele; k++) allele_prob[k] = 0; \ + for (k=0; kn_allele; k++) \ + for (l=0; l<=k; l++) \ + { \ + if ( src_is_missing || src_is_vector_end ) { src++; continue; } \ + double prob = ma->pl2prob[pl2prob_idx]; \ + allele_prob[k] += prob; \ + allele_prob[l] += prob; \ + src++; \ + } \ + /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \ + allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \ + int si,sj,tmp; \ + for (si=0; sin_allele-1; si++) idx[si] = si; \ + for (si=1; sin_allele-1; si++) \ + for (sj=si; sj>0 && allele_prob[idx[sj]] > allele_prob[idx[sj-1]]; sj--) \ + tmp = idx[sj], idx[sj] = idx[sj-1], idx[sj-1] = tmp; \ + /*for debugging only: test order*/ \ + for (si=1; sin_allele-1; si++) \ + assert( allele_prob[idx[si-1]] >= allele_prob[idx[si]] ); \ + allele_prob--; idx--; /* this was to keep REF first */ \ + int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl; \ + ptr[0] = 0; \ + for (k=1; k<=args->local_alleles && kn_allele; k++) ptr[k] = idx[k]+1; \ + int kmax = k; \ + for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; \ + /* insertion sort by indexes to the output order, ascending */ \ + for (k=1; k0 && map[ptr[l]] < map[ptr[l-1]]; l--) \ + tmp = ptr[l], ptr[l] = ptr[l-1], ptr[l-1] = tmp; \ + ismpl++; \ + } \ + } + switch (fmt_ori->type) + { + case BCF_BT_INT8: BRANCH( int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, *src); break; + case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src=0 && *srctype); + } + #undef BRANCH + } + ma->nlaa = nlaa; +} + +void update_local_alleles(args_t *args, bcf1_t *out) +{ + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + int i,j,k,ismpl=0,nsamples = bcf_hdr_nsamples(args->out_hdr); + for (i=0; inreaders; i++) + { + int irec = ma->buf[i].cur; + bcf_sr_t *reader = &files->readers[i]; + int nsmpl = 
bcf_hdr_nsamples(reader->header); + for (k=0; klaa + ismpl*(1+args->local_alleles); + int32_t *dst = ma->laa + ismpl*ma->nlaa; + j = 0; + if ( irec>=0 ) + { + for (; jnlaa; j++) + { + if ( src[j+1]==bcf_int32_missing ) dst[j] = bcf_int32_missing; + else if ( src[j+1]==bcf_int32_vector_end ) break; + else + dst[j] = ma->buf[i].rec[irec].map[src[j+1]]; + } + } + if ( j==0 ) dst[j++] = bcf_int32_missing; + for (; jnlaa; j++) src[j] = bcf_int32_vector_end; + ismpl++; + } + } + bcf_update_format_int32(args->out_hdr, out, "LAA", ma->laa, nsamples*ma->nlaa); +} + void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) { bcf_srs_t *files = args->files; @@ -1333,7 +1549,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); static int warned = 0; - int nsize = 0, msize = sizeof(int32_t); + int nsize = 0; for (i=0; inreaders; i++) { bcf_fmt_t *fmt = fmt_map[i]; @@ -1343,17 +1559,18 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } if ( nsize==0 ) nsize = 1; - if ( ma->ntmp_arr < nsamples*nsize*msize ) + size_t msize = sizeof(int32_t)*nsize*nsamples; + if ( msize > 2147483647 ) { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); - if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); - if ( ma->ntmp_arr > 2147483647 ) - { - if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); - warned = 1; - return; - } + if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize); + ma->ntmp_arr = msize; } memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); 
@@ -1509,6 +1726,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); } + if ( nmax < str->l ) nmax = str->l; src += fmt_ori->size; } continue; @@ -1520,17 +1738,18 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key); } // update the record - if ( ma->ntmp_arr < nsamples*nmax ) + size_t msize = nsamples*nmax; + if ( msize > 2147483647 ) { - ma->ntmp_arr = nsamples*nmax; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); - if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); - if ( ma->ntmp_arr > 2147483647 ) - { - if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); - warned = 1; - return; - } + if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize); + ma->ntmp_arr = msize; } char *tgt = (char*) ma->tmp_arr; for (i=0; itmp_arr, nsamples*nmax); } +// Note: only diploid Number=G tags only for now +void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr) +{ + int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr); + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + bcf_fmt_t *fmt = fmt_map[irdr]; + const char *key = 
files->readers[irdr].header->id[BCF_DT_ID][fmt_map[irdr]->id].key; + size_t nsize = (ma->nlaa+1)*(ma->nlaa+2)/2; // max number of Number=G localized fields + size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); + msize *= nsamples*nsize; + if ( msize > 2147483647 ) + { + static int warned = 0; + if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); + ma->ntmp_arr = msize; + } + int ismpl = 0; + for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + bcf_fmt_t *fmt_ori = fmt_map[i]; + bcf1_t *line = maux_get_line(args, i); + int nsmpl = bcf_hdr_nsamples(hdr); + + if ( !fmt_ori ) + { + // fill missing values + #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jtmp_arr + ismpl*nsize; \ + tgt_set_missing; \ + for (k=1; ktype) + { + case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; + default: error("Unexpected case: %d, %s\n", fmt->type, key); + } + #undef BRANCH + continue; + } + if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n"); + + // localize + #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \ + for 
(j=0; jp + j*fmt_ori->n; \ + tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \ + int *laa = ma->laa + (1+args->local_alleles)*ismpl; \ + int ii,ij,tgt_idx = 0; \ + for (ii=0; ii<=ma->nlaa; ii++) \ + { \ + if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \ + for (ij=0; ij<=ii; ij++) \ + { \ + int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \ + if ( src_is_missing ) tgt_set_missing; \ + else if ( src_is_vector_end ) break; \ + else tgt[tgt_idx] = src[src_idx]; \ + tgt_idx++; \ + } \ + } \ + if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \ + for (; tgt_idxtype) + { + case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break; + default: error("Unexpected case: %d, %s\n", fmt_ori->type, key); + } + #undef BRANCH + } + args->tmps.l = 0; + kputc('L',&args->tmps); + kputs(key,&args->tmps); + if ( fmt_map[irdr]->type==BCF_BT_FLOAT ) + bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize); + else + bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); + ma->laa_dirty = 1; +} +void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr) +{ + int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr); + bcf_srs_t *files = 
args->files; + maux_t *ma = args->maux; + bcf_fmt_t *fmt = fmt_map[irdr]; + const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt->id].key; + size_t nsize = IS_VL_R(files->readers[irdr].header, fmt->id) ? ma->nlaa + 1 : ma->nlaa; + size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); + msize *= nsamples*nsize; + if ( msize > 2147483647 ) + { + static int warned = 0; + if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); + ma->ntmp_arr = msize; + } + int ismpl = 0, ibeg = IS_VL_R(files->readers[irdr].header, fmt->id) ? 0 : 1;; + for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + bcf_fmt_t *fmt_ori = fmt_map[i]; + int nsmpl = bcf_hdr_nsamples(hdr); + + if ( !fmt_ori ) + { + // fill missing values + #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jtmp_arr + ismpl*nsize; \ + tgt_set_missing; \ + for (k=1; ktype) + { + case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; + default: error("Unexpected case: %d, %s\n", fmt->type, key); + } + #undef BRANCH + continue; + } + + // localize + #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jp + 
j*fmt_ori->n; \ + tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \ + int *laa = ma->laa + (1+args->local_alleles)*ismpl; \ + int ii,tgt_idx = 0; \ + for (ii=ibeg; ii<=ma->nlaa; ii++) \ + { \ + if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \ + int src_idx = laa[ii] - ibeg; \ + if ( src_is_missing ) tgt_set_missing; \ + else if ( src_is_vector_end ) break; \ + else tgt[tgt_idx] = src[src_idx]; \ + tgt_idx++; \ + } \ + if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \ + for (; tgt_idxtype) + { + case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break; + default: error("Unexpected case: %d, %s\n", fmt_ori->type, key); + } + #undef BRANCH + } + args->tmps.l = 0; + kputc('L',&args->tmps); + kputs(key,&args->tmps); + if ( fmt_map[irdr]->type==BCF_BT_FLOAT ) + bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize); + else + bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); + ma->laa_dirty = 1; +} void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) { bcf_srs_t *files = args->files; @@ -1579,6 +1996,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n; } + 
if ( ma->nlaa && length!=BCF_VL_FIXED ) + { + if ( length==BCF_VL_G ) merge_localized_numberG_format_field(args,fmt_map,out,i); + else if ( length==BCF_VL_A || length==BCF_VL_R ) merge_localized_numberAR_format_field(args,fmt_map,out,i); + return; + } + if ( type==BCF_BT_CHAR ) { merge_format_string(args, key, fmt_map, out, length, nsize); @@ -1586,17 +2010,18 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); - if ( ma->ntmp_arr < nsamples*nsize*msize ) + msize *= nsamples*nsize; + if ( msize > 2147483647 ) { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); - if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); - if ( ma->ntmp_arr > 2147483647 ) - { - if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); - warned = 1; - return; - } + if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); + ma->ntmp_arr = msize; } // Fill the temp array for all samples by collecting values from all files @@ -1790,7 +2215,7 @@ void merge_format(args_t *args, bcf1_t *out) khiter_t kitr; strdict_t *tmph = args->tmph; kh_clear(strdict, tmph); - int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index + int i, j, ret, has_GT = 0, has_PL = -1, max_ifmt = 0; // max fmt index for (i=0; inreaders; i++) { bcf1_t 
*line = maux_get_line(args,i); @@ -1820,6 +2245,7 @@ void merge_format(args_t *args, bcf1_t *out) memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*)); ma->nfmt_map = max_ifmt+1; } + if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; } } kitr = kh_put(strdict, tmph, key, &ret); kh_value(tmph, kitr) = ifmt; @@ -1833,6 +2259,12 @@ void merge_format(args_t *args, bcf1_t *out) ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1; } + if ( args->local_alleles ) + { + ma->laa_dirty = ma->nlaa = 0; + if ( out->n_allele > args->local_alleles + 1 ) init_local_alleles(args, out, has_PL); + } + out->n_sample = bcf_hdr_nsamples(out_hdr); if ( has_GT ) merge_GT(args, ma->fmt_map, out); @@ -1840,6 +2272,10 @@ void merge_format(args_t *args, bcf1_t *out) for (i=1; i<=max_ifmt; i++) merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); + + if ( ma->laa_dirty ) + update_local_alleles(args, out); + out->d.indiv_dirty = 1; } @@ -2041,6 +2477,23 @@ void gvcf_flush(args_t *args, int done) } } +static inline int is_gvcf_block(bcf1_t *line) +{ + if ( line->rlen<=1 ) return 0; + if ( strlen(line->d.allele[0])==line->rlen ) return 0; + if ( line->n_allele==1 ) return 1; + + int i; + for (i=1; in_allele; i++) + { + if ( !strcmp(line->d.allele[i],"<*>") ) return 1; + if ( !strcmp(line->d.allele[i],"") ) return 1; + if ( !strcmp(line->d.allele[i],"") ) return 1; + } + return 0; +} +static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2; + /* Check incoming lines for new gVCF blocks, set pointer to the current source buffer (gvcf or readers). 
In contrast to gvcf_flush, this function can be @@ -2059,6 +2512,7 @@ void gvcf_stage(args_t *args, int pos) maux->gvcf_min = INT_MAX; for (i=0; inreaders; i++) { + if ( gaux[i].active && gaux[i].end < pos ) gaux[i].active = 0; if ( gaux[i].active ) { // gvcf block should not overlap with another record @@ -2077,7 +2531,7 @@ void gvcf_stage(args_t *args, int pos) int irec = maux->buf[i].beg; bcf_hdr_t *hdr = bcf_sr_get_header(files, i); bcf1_t *line = args->files->readers[i].buffer[irec]; - int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); + int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0; if ( ret==1 ) { if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END @@ -2218,7 +2672,6 @@ void debug_state(args_t *args) fprintf(stderr,"\n"); } - /* Determine which line should be merged from which reader: go through all readers and all buffered lines, expand REF,ALT and try to match lines with @@ -2227,7 +2680,6 @@ void debug_state(args_t *args) int can_merge(args_t *args) { bcf_srs_t *files = args->files; - int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; @@ -2240,6 +2692,9 @@ int can_merge(args_t *args) } maux->var_types = maux->nals = 0; + // this is only for the `-m none -g` mode, ensure that <*> lines come last + #define VCF_GVCF_REF 1 + for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; @@ -2257,12 +2712,17 @@ int can_merge(args_t *args) buf->rec[j].skip = SKIP_DIFF; ntodo++; + bcf1_t *line = buf->lines[j]; if ( args->merge_by_id ) - id = buf->lines[j]->d.id; + id = line->d.id; else { - int var_type = bcf_get_variant_types(buf->lines[j]); - maux->var_types |= var_type ? var_type<<1 : 1; + int var_type = bcf_get_variant_types(line); + maux->var_types |= var_type ? 
var_type<<2 : 2; + + // for the `-m none -g` mode + if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) ) + maux->var_types |= VCF_GVCF_REF; } } @@ -2294,7 +2754,7 @@ int can_merge(args_t *args) bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer int line_type = bcf_get_variant_types(line); - line_type = line_type ? line_type<<1 : 1; + line_type = line_type ? line_type<<2 : 2; // select relevant lines if ( args->merge_by_id ) @@ -2303,6 +2763,12 @@ int can_merge(args_t *args) } else { + // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant + // records come last, otherwise infinite loop is created (#1164) + if ( args->collapse==COLLAPSE_NONE && args->do_gvcf ) + { + if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue; + } if ( args->collapse==COLLAPSE_NONE && maux->nals ) { // All alleles of the tested record must be present in the @@ -2366,7 +2832,6 @@ int can_merge(args_t *args) */ void stage_line(args_t *args) { - int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; bcf_srs_t *files = args->files; maux_t *maux = args->maux; @@ -2436,13 +2901,9 @@ void stage_line(args_t *args) void merge_line(args_t *args) { - if ( args->regs ) - { - if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return; - } - bcf1_t *out = args->out_line; merge_chrom2qual(args, out); + if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return; merge_filter(args, out); merge_info(args, out); if ( args->do_gvcf ) @@ -2490,9 +2951,59 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c error_errno("[%s] Failed to add program information to header", __func__); } +void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr) +{ + char **str = NULL; + int i,j, nstr = 0, mstr = 0; + for (i=0; inhrec; i++) + { + if ( hdr->hrec[i]->type!=BCF_HL_FMT ) 
continue; + j = bcf_hrec_find_key(hdr->hrec[i],"ID"); + if ( j<0 ) continue; + char *key = hdr->hrec[i]->vals[j]; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); + assert( id>=0 ); + int localize = 0; + if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G ) localize = 1; + if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A ) localize = 1; + if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R ) localize = 1; + if ( !localize ) continue; + args->tmps.l = 0; + + uint32_t e = 0, nout = 0; + e |= ksprintf(&args->tmps, "##%s=<", hdr->hrec[i]->key) < 0; + for (j=0; jhrec[i]->nkeys; j++) + { + if ( !strcmp("IDX",hdr->hrec[i]->keys[j]) ) continue; + if ( nout ) e |= kputc(',',&args->tmps) < 0; + if ( !strcmp("ID",hdr->hrec[i]->keys[j]) ) + e |= ksprintf(&args->tmps,"%s=L%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0; + else if ( !strcmp("Number",hdr->hrec[i]->keys[j]) ) + e |= ksprintf(&args->tmps,"Number=.") < 0; + else if ( !strcmp("Description",hdr->hrec[i]->keys[j]) && hdr->hrec[i]->vals[j][0]=='"' ) + e |= ksprintf(&args->tmps,"Description=\"Localized field: %s", hdr->hrec[i]->vals[j]+1) < 0; + else + e |= ksprintf(&args->tmps,"%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0; + nout++; + } + e |= ksprintf(&args->tmps,">\n") < 0; + if ( e ) error("Failed to format the header line for %s\n", key); + nstr++; + hts_expand(char*,nstr,mstr,str); + str[nstr-1] = strdup(args->tmps.s); + } + if ( !nstr ) return; + bcf_hdr_append(hdr,"##FORMAT="); + for (i=0; iout_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); @@ -2509,6 +3020,7 @@ void merge_vcf(args_t *args) char 
buf[24]; snprintf(buf,sizeof buf,"%d",i+1); merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples); } + if ( args->local_alleles ) hdr_add_localized_tags(args, args->out_hdr); if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); if (bcf_hdr_sync(args->out_hdr) < 0) error_errno("[%s] Failed to update header", __func__); @@ -2580,7 +3092,9 @@ static void usage(void) fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); fprintf(stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(stderr, " -l, --file-list read file names from the file\n"); + fprintf(stderr, " -L, --local-alleles EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); fprintf(stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); + fprintf(stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); @@ -2608,6 +3122,7 @@ int main_vcfmerge(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"merge",required_argument,NULL,'m'}, + {"local-alleles",required_argument,NULL,'L'}, {"gvcf",required_argument,NULL,'g'}, {"file-list",required_argument,NULL,'l'}, {"missing-to-ref",no_argument,NULL,'0'}, @@ -2622,11 +3137,19 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"info-rules",required_argument,NULL,'i'}, {"no-version",no_argument,NULL,8}, + 
{"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) { + char *tmp; + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) { switch (c) { + case 'L': + args->local_alleles = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg); + if ( args->local_alleles < 1 ) + error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg); + break; case 'F': if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; @@ -2672,6 +3195,7 @@ int main_vcfmerge(int argc, char *argv[]) case 3 : args->force_samples = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->no_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -2680,7 +3204,13 @@ int main_vcfmerge(int argc, char *argv[]) if ( argc==optind && !args->file_list ) usage(); if ( argc-optind<2 && !args->file_list ) usage(); - args->files->require_index = 1; + if ( args->no_index ) + { + if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n"); + bcf_sr_set_opt(args->files,BCF_SR_ALLOW_NO_IDX); + } + else + bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 651ea51..0f1c94c 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -2,7 +2,7 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -27,6 +27,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -60,6 +61,8 @@ typedef khash_t(strdict) strdict_t; #define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; } +#define PL2PROB_MAX 1024 + // For merging INFO Number=A,G,R tags typedef struct { @@ -134,6 +137,11 @@ typedef struct gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line int nout_smpl; kstring_t *str; + int32_t *laa; // localized alternate alleles given as input-based indexes in per-sample blocks of (args->local_alleles+1) values, 0 is always first + int nlaa, laa_dirty; // number of LAA alleles actually used at this site, and was any L* added? + int32_t *tmpi, *k2k; + double *tmpd, *pl2prob; // mapping from phred-score likelihoods (PL) to probability + int ntmpi, ntmpd, nk2k; } maux_t; @@ -143,7 +151,7 @@ typedef struct maux_t *maux; regidx_t *regs; // apply regions only after the blocks are expanded regitr_t *regs_itr; - int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref; + int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index; char *header_fname, *output_fname, *regions_list, *info_rules, *file_list; faidx_t *gvcf_fai; info_rule_t *rules; @@ -156,6 +164,7 @@ typedef struct bcf_hdr_t *out_hdr; char **argv; int argc, n_threads, record_cmd_line; + int local_alleles; // the value of -L option } args_t; @@ -264,7 +273,28 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals); } else + { + int isrc, idst = 0; + #define BRANCH(type_t,is_missing,is_vector_end) { \ + type_t *ptr = (type_t*) rule->vals; \ + for (isrc=0; isrcnvals; isrc++) \ + { \ + if ( is_vector_end ) break; \ + if ( is_missing ) continue; \ + if ( idst!=isrc ) ptr[idst] = ptr[isrc]; \ + idst++; \ + } \ + } + switch (rule->type) { + case BCF_HT_INT: BRANCH(int32_t, ptr[isrc]==bcf_int32_missing, 
ptr[isrc]==bcf_int32_vector_end); break; + case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[isrc]), bcf_float_is_vector_end(ptr[isrc])); break; + default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type); + } + #undef BRANCH + + rule->nvals = idst; bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type); + } } static int info_rules_comp_key2(const void *a, const void *b) @@ -346,7 +376,7 @@ static void info_rules_init(args_t *args) if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); - else error("The type is not supported: \"%s\"\n", rule->hdr_tag); + else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); ss = strchr(ss, '\0'); ss++; if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag); @@ -368,8 +398,17 @@ static void info_rules_init(args_t *args) bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G || bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R ) ? 
1 : 0; - if ( is_join && is_agr ) - error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag); + if ( is_join && bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)!=BCF_VL_VAR ) + { + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->out_hdr, BCF_HL_INFO, "ID", rule->hdr_tag, NULL); + hrec = bcf_hrec_dup(hrec); + int i = bcf_hrec_find_key(hrec, "Number"); + if ( i<0 ) error("Uh, could not find the entry Number in the header record of %s\n",rule->hdr_tag); + free(hrec->vals[i]); + hrec->vals[i] = strdup("."); + bcf_hdr_remove(args->out_hdr,BCF_HL_INFO, rule->hdr_tag); + bcf_hdr_add_hrec(args->out_hdr, hrec); + } if ( !is_join && !is_agr ) error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag); } @@ -691,7 +730,7 @@ maux_t *maux_init(args_t *args) assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) ); if ( args->do_gvcf ) { - ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); + ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n for (i=0; in; i++) ma->gvcf[i].line = bcf_init1(); } @@ -701,6 +740,13 @@ maux_t *maux_init(args_t *args) for (i=0; in; i++) ma->buf[i].rid = -1; ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); + if ( args->local_alleles ) + { + ma->laa = (int32_t*)malloc(sizeof(*ma->laa)*ma->nout_smpl*(1+args->local_alleles)); + ma->pl2prob = (double*)malloc(PL2PROB_MAX*sizeof(*ma->pl2prob)); + for (i=0; ipl2prob[i] = pow(10,-0.1*i); + } return ma; } void maux_destroy(maux_t *ma) @@ -739,6 +785,11 @@ void maux_destroy(maux_t *ma) free(ma->smpl_ploidy); free(ma->smpl_nGsize); free(ma->chr); + free(ma->laa); + free(ma->tmpi); + free(ma->k2k); + free(ma->tmpd); + free(ma->pl2prob); free(ma); } void maux_expand1(buffer_t *buf, int size) @@ -1107,7 +1158,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, 
*src==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break; - default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); + default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1); } #undef BRANCH } @@ -1137,7 +1188,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break; - default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); + default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1); } #undef BRANCH } @@ -1327,6 +1378,171 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl) return max_ploidy; } +// Sets ma->laa to local indexes relevant for each sample or missing/vector_end. +// The indexes are with respect to the source indexes and must be translated as +// the very last step. +void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL) +{ + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + int i,j,k,l, ismpl = 0, nlaa = 0; + static int warned = 0; + + hts_expand(double,out->n_allele,ma->ntmpd,ma->tmpd); // allele probabilities + hts_expand(int,out->n_allele,ma->ntmpi,ma->tmpi); // indexes of the sorted probabilities + + // Let map[] be the mapping from src to output idx. 
Then k2k[] is mapping from src allele idxs to src allele idxs + // reordered so that if in_allele,ma->nk2k,ma->k2k); + + // Determine local alleles: either take all that are present in the reader or use PL to determine the best + // subset for each sample. The alleles must be listed in the order of the alleles in the output file. + for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + bcf_fmt_t *fmt_ori = ma->fmt_map[files->nreaders*ifmt_PL+i]; + bcf1_t *line = maux_get_line(args, i); + int nsmpl = bcf_hdr_nsamples(hdr); + if ( line ) + { + if ( nlaa < line->n_allele - 1 ) + nlaa = line->n_allele - 1 <= args->local_alleles ? line->n_allele - 1 : args->local_alleles; + + for (j=0; jn_allele; j++) ma->k2k[j] = j; + + if ( line->n_allele <= args->local_alleles + 1 ) + { + // sort to the output order, insertion sort, ascending + int *map = ma->buf[i].rec[ma->buf[i].cur].map; + int *k2k = ma->k2k; + int tmp; + for (k=1; kn_allele; k++) + for (l=k; l>0 && map[k2k[l]] < map[k2k[l-1]]; l--) + tmp = k2k[l], k2k[l] = k2k[l-1], k2k[l-1] = tmp; + + // fewer than the allowed number of alleles, use all alleles from this file + for (j=0; jlaa + (1+args->local_alleles)*ismpl; + for (k=0; kn_allele; k++) ptr[k] = k2k[k]; + for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; + ismpl++; + } + continue; + } + } + if ( !line || !fmt_ori ) + { + // no values, fill in missing values + for (j=0; jlaa + (1+args->local_alleles)*ismpl; + ptr[0] = bcf_int32_missing; + for (k=1; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; + ismpl++; + } + continue; + } + + // there are more alternate alleles in the input files than is allowed on output, need to subset + if ( ifmt_PL==-1 ) + { + if ( !warned ) + fprintf(bcftools_stderr,"Warning: local alleles are determined from FORMAT/PL but the tag is missing, cannot apply --local-alleles\n"); + warned = 1; + ma->nlaa = 0; + return; + } + + if ( !IS_VL_G(hdr, fmt_ori->id) ) 
error("FORMAT/PL must be defined as Number=G\n"); + if ( 2*fmt_ori->n != line->n_allele*(line->n_allele+1) ) error("Todo: haploid PL to LPL\n"); + + int *map = ma->buf[i].rec[ma->buf[i].cur].map; + double *allele_prob = ma->tmpd; + int *idx = ma->tmpi; + #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \ + src_type_t *src = (src_type_t*) fmt_ori->p; \ + for (j=0; jn_allele; k++) allele_prob[k] = 0; \ + for (k=0; kn_allele; k++) \ + for (l=0; l<=k; l++) \ + { \ + if ( src_is_missing || src_is_vector_end ) { src++; continue; } \ + double prob = ma->pl2prob[pl2prob_idx]; \ + allele_prob[k] += prob; \ + allele_prob[l] += prob; \ + src++; \ + } \ + /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \ + allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \ + int si,sj,tmp; \ + for (si=0; sin_allele-1; si++) idx[si] = si; \ + for (si=1; sin_allele-1; si++) \ + for (sj=si; sj>0 && allele_prob[idx[sj]] > allele_prob[idx[sj-1]]; sj--) \ + tmp = idx[sj], idx[sj] = idx[sj-1], idx[sj-1] = tmp; \ + /*for debugging only: test order*/ \ + for (si=1; sin_allele-1; si++) \ + assert( allele_prob[idx[si-1]] >= allele_prob[idx[si]] ); \ + allele_prob--; idx--; /* this was to keep REF first */ \ + int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl; \ + ptr[0] = 0; \ + for (k=1; k<=args->local_alleles && kn_allele; k++) ptr[k] = idx[k]+1; \ + int kmax = k; \ + for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; \ + /* insertion sort by indexes to the output order, ascending */ \ + for (k=1; k0 && map[ptr[l]] < map[ptr[l-1]]; l--) \ + tmp = ptr[l], ptr[l] = ptr[l-1], ptr[l-1] = tmp; \ + ismpl++; \ + } \ + } + switch (fmt_ori->type) + { + case BCF_BT_INT8: BRANCH( int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, *src); break; + case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src=0 && *srctype); + } + #undef BRANCH + } 
+ ma->nlaa = nlaa; +} + +void update_local_alleles(args_t *args, bcf1_t *out) +{ + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + int i,j,k,ismpl=0,nsamples = bcf_hdr_nsamples(args->out_hdr); + for (i=0; inreaders; i++) + { + int irec = ma->buf[i].cur; + bcf_sr_t *reader = &files->readers[i]; + int nsmpl = bcf_hdr_nsamples(reader->header); + for (k=0; klaa + ismpl*(1+args->local_alleles); + int32_t *dst = ma->laa + ismpl*ma->nlaa; + j = 0; + if ( irec>=0 ) + { + for (; jnlaa; j++) + { + if ( src[j+1]==bcf_int32_missing ) dst[j] = bcf_int32_missing; + else if ( src[j+1]==bcf_int32_vector_end ) break; + else + dst[j] = ma->buf[i].rec[irec].map[src[j+1]]; + } + } + if ( j==0 ) dst[j++] = bcf_int32_missing; + for (; jnlaa; j++) src[j] = bcf_int32_vector_end; + ismpl++; + } + } + bcf_update_format_int32(args->out_hdr, out, "LAA", ma->laa, nsamples*ma->nlaa); +} + void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) { bcf_srs_t *files = args->files; @@ -1335,7 +1551,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); static int warned = 0; - int nsize = 0, msize = sizeof(int32_t); + int nsize = 0; for (i=0; inreaders; i++) { bcf_fmt_t *fmt = fmt_map[i]; @@ -1345,17 +1561,18 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } if ( nsize==0 ) nsize = 1; - if ( ma->ntmp_arr < nsamples*nsize*msize ) + size_t msize = sizeof(int32_t)*nsize*nsamples; + if ( msize > 2147483647 ) { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); - if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); - if ( ma->ntmp_arr > 2147483647 ) - { - if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); - warned = 1; - return; - } + if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at 
%s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize); + ma->ntmp_arr = msize; } memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); @@ -1511,6 +1728,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); } + if ( nmax < str->l ) nmax = str->l; src += fmt_ori->size; } continue; @@ -1522,17 +1740,18 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key); } // update the record - if ( ma->ntmp_arr < nsamples*nmax ) + size_t msize = nsamples*nmax; + if ( msize > 2147483647 ) { - ma->ntmp_arr = nsamples*nmax; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); - if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); - if ( ma->ntmp_arr > 2147483647 ) - { - if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); - warned = 1; - return; - } + if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize); + ma->ntmp_arr = msize; } char *tgt = (char*) ma->tmp_arr; for (i=0; itmp_arr, nsamples*nmax); } +// Note: 
only diploid Number=G tags only for now +void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr) +{ + int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr); + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + bcf_fmt_t *fmt = fmt_map[irdr]; + const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt_map[irdr]->id].key; + size_t nsize = (ma->nlaa+1)*(ma->nlaa+2)/2; // max number of Number=G localized fields + size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); + msize *= nsamples*nsize; + if ( msize > 2147483647 ) + { + static int warned = 0; + if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); + ma->ntmp_arr = msize; + } + int ismpl = 0; + for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + bcf_fmt_t *fmt_ori = fmt_map[i]; + bcf1_t *line = maux_get_line(args, i); + int nsmpl = bcf_hdr_nsamples(hdr); + + if ( !fmt_ori ) + { + // fill missing values + #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jtmp_arr + ismpl*nsize; \ + tgt_set_missing; \ + for (k=1; ktype) + { + case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; + default: error("Unexpected case: 
%d, %s\n", fmt->type, key); + } + #undef BRANCH + continue; + } + if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n"); + + // localize + #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jp + j*fmt_ori->n; \ + tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \ + int *laa = ma->laa + (1+args->local_alleles)*ismpl; \ + int ii,ij,tgt_idx = 0; \ + for (ii=0; ii<=ma->nlaa; ii++) \ + { \ + if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \ + for (ij=0; ij<=ii; ij++) \ + { \ + int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \ + if ( src_is_missing ) tgt_set_missing; \ + else if ( src_is_vector_end ) break; \ + else tgt[tgt_idx] = src[src_idx]; \ + tgt_idx++; \ + } \ + } \ + if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \ + for (; tgt_idxtype) + { + case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break; + default: error("Unexpected case: %d, %s\n", fmt_ori->type, key); + } + #undef BRANCH + } + args->tmps.l = 0; + kputc('L',&args->tmps); + kputs(key,&args->tmps); + if ( fmt_map[irdr]->type==BCF_BT_FLOAT ) + bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, 
nsamples*nsize); + else + bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); + ma->laa_dirty = 1; +} +void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr) +{ + int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr); + bcf_srs_t *files = args->files; + maux_t *ma = args->maux; + bcf_fmt_t *fmt = fmt_map[irdr]; + const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt->id].key; + size_t nsize = IS_VL_R(files->readers[irdr].header, fmt->id) ? ma->nlaa + 1 : ma->nlaa; + size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); + msize *= nsamples*nsize; + if ( msize > 2147483647 ) + { + static int warned = 0; + if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); + ma->ntmp_arr = msize; + } + int ismpl = 0, ibeg = IS_VL_R(files->readers[irdr].header, fmt->id) ? 
0 : 1;; + for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + bcf_hdr_t *hdr = reader->header; + bcf_fmt_t *fmt_ori = fmt_map[i]; + int nsmpl = bcf_hdr_nsamples(hdr); + + if ( !fmt_ori ) + { + // fill missing values + #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jtmp_arr + ismpl*nsize; \ + tgt_set_missing; \ + for (k=1; ktype) + { + case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; + default: error("Unexpected case: %d, %s\n", fmt->type, key); + } + #undef BRANCH + continue; + } + + // localize + #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \ + for (j=0; jp + j*fmt_ori->n; \ + tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \ + int *laa = ma->laa + (1+args->local_alleles)*ismpl; \ + int ii,tgt_idx = 0; \ + for (ii=ibeg; ii<=ma->nlaa; ii++) \ + { \ + if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \ + int src_idx = laa[ii] - ibeg; \ + if ( src_is_missing ) tgt_set_missing; \ + else if ( src_is_vector_end ) break; \ + else tgt[tgt_idx] = src[src_idx]; \ + tgt_idx++; \ + } \ + if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \ + for (; tgt_idxtype) + { + case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, int32_t, 
src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break; + case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break; + default: error("Unexpected case: %d, %s\n", fmt_ori->type, key); + } + #undef BRANCH + } + args->tmps.l = 0; + kputc('L',&args->tmps); + kputs(key,&args->tmps); + if ( fmt_map[irdr]->type==BCF_BT_FLOAT ) + bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize); + else + bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize); + ma->laa_dirty = 1; +} void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) { bcf_srs_t *files = args->files; @@ -1581,6 +1998,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n; } + if ( ma->nlaa && length!=BCF_VL_FIXED ) + { + if ( length==BCF_VL_G ) merge_localized_numberG_format_field(args,fmt_map,out,i); + else if ( length==BCF_VL_A || length==BCF_VL_R ) merge_localized_numberAR_format_field(args,fmt_map,out,i); + return; + } + if ( type==BCF_BT_CHAR ) { merge_format_string(args, key, fmt_map, out, length, nsize); @@ -1588,17 +2012,18 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) } size_t msize = sizeof(float)>sizeof(int32_t) ? 
sizeof(float) : sizeof(int32_t); - if ( ma->ntmp_arr < nsamples*nsize*msize ) + msize *= nsamples*nsize; + if ( msize > 2147483647 ) { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); - if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); - if ( ma->ntmp_arr > 2147483647 ) - { - if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); - warned = 1; - return; - } + if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize); + warned = 1; + return; + } + if ( ma->ntmp_arr < msize ) + { + ma->tmp_arr = realloc(ma->tmp_arr, msize); + if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); + ma->ntmp_arr = msize; } // Fill the temp array for all samples by collecting values from all files @@ -1792,7 +2217,7 @@ void merge_format(args_t *args, bcf1_t *out) khiter_t kitr; strdict_t *tmph = args->tmph; kh_clear(strdict, tmph); - int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index + int i, j, ret, has_GT = 0, has_PL = -1, max_ifmt = 0; // max fmt index for (i=0; inreaders; i++) { bcf1_t *line = maux_get_line(args,i); @@ -1822,6 +2247,7 @@ void merge_format(args_t *args, bcf1_t *out) memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*)); ma->nfmt_map = max_ifmt+1; } + if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; } } kitr = kh_put(strdict, tmph, key, &ret); kh_value(tmph, kitr) = ifmt; @@ -1835,6 +2261,12 @@ void merge_format(args_t *args, bcf1_t *out) 
ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1; } + if ( args->local_alleles ) + { + ma->laa_dirty = ma->nlaa = 0; + if ( out->n_allele > args->local_alleles + 1 ) init_local_alleles(args, out, has_PL); + } + out->n_sample = bcf_hdr_nsamples(out_hdr); if ( has_GT ) merge_GT(args, ma->fmt_map, out); @@ -1842,6 +2274,10 @@ void merge_format(args_t *args, bcf1_t *out) for (i=1; i<=max_ifmt; i++) merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); + + if ( ma->laa_dirty ) + update_local_alleles(args, out); + out->d.indiv_dirty = 1; } @@ -2043,6 +2479,23 @@ void gvcf_flush(args_t *args, int done) } } +static inline int is_gvcf_block(bcf1_t *line) +{ + if ( line->rlen<=1 ) return 0; + if ( strlen(line->d.allele[0])==line->rlen ) return 0; + if ( line->n_allele==1 ) return 1; + + int i; + for (i=1; in_allele; i++) + { + if ( !strcmp(line->d.allele[i],"<*>") ) return 1; + if ( !strcmp(line->d.allele[i],"") ) return 1; + if ( !strcmp(line->d.allele[i],"") ) return 1; + } + return 0; +} +static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2; + /* Check incoming lines for new gVCF blocks, set pointer to the current source buffer (gvcf or readers). In contrast to gvcf_flush, this function can be @@ -2061,6 +2514,7 @@ void gvcf_stage(args_t *args, int pos) maux->gvcf_min = INT_MAX; for (i=0; inreaders; i++) { + if ( gaux[i].active && gaux[i].end < pos ) gaux[i].active = 0; if ( gaux[i].active ) { // gvcf block should not overlap with another record @@ -2079,7 +2533,7 @@ void gvcf_stage(args_t *args, int pos) int irec = maux->buf[i].beg; bcf_hdr_t *hdr = bcf_sr_get_header(files, i); bcf1_t *line = args->files->readers[i].buffer[irec]; - int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); + int ret = is_gvcf_block(line) ? 
bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0; if ( ret==1 ) { if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END @@ -2220,7 +2674,6 @@ void debug_state(args_t *args) fprintf(bcftools_stderr,"\n"); } - /* Determine which line should be merged from which reader: go through all readers and all buffered lines, expand REF,ALT and try to match lines with @@ -2229,7 +2682,6 @@ void debug_state(args_t *args) int can_merge(args_t *args) { bcf_srs_t *files = args->files; - int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; @@ -2242,6 +2694,9 @@ int can_merge(args_t *args) } maux->var_types = maux->nals = 0; + // this is only for the `-m none -g` mode, ensure that <*> lines come last + #define VCF_GVCF_REF 1 + for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; @@ -2259,12 +2714,17 @@ int can_merge(args_t *args) buf->rec[j].skip = SKIP_DIFF; ntodo++; + bcf1_t *line = buf->lines[j]; if ( args->merge_by_id ) - id = buf->lines[j]->d.id; + id = line->d.id; else { - int var_type = bcf_get_variant_types(buf->lines[j]); - maux->var_types |= var_type ? var_type<<1 : 1; + int var_type = bcf_get_variant_types(line); + maux->var_types |= var_type ? var_type<<2 : 2; + + // for the `-m none -g` mode + if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) ) + maux->var_types |= VCF_GVCF_REF; } } @@ -2296,7 +2756,7 @@ int can_merge(args_t *args) bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer int line_type = bcf_get_variant_types(line); - line_type = line_type ? line_type<<1 : 1; + line_type = line_type ? 
line_type<<2 : 2; // select relevant lines if ( args->merge_by_id ) @@ -2305,6 +2765,12 @@ int can_merge(args_t *args) } else { + // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant + // records come last, otherwise infinite loop is created (#1164) + if ( args->collapse==COLLAPSE_NONE && args->do_gvcf ) + { + if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue; + } if ( args->collapse==COLLAPSE_NONE && maux->nals ) { // All alleles of the tested record must be present in the @@ -2368,7 +2834,6 @@ int can_merge(args_t *args) */ void stage_line(args_t *args) { - int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1; bcf_srs_t *files = args->files; maux_t *maux = args->maux; @@ -2438,13 +2903,9 @@ void stage_line(args_t *args) void merge_line(args_t *args) { - if ( args->regs ) - { - if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return; - } - bcf1_t *out = args->out_line; merge_chrom2qual(args, out); + if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return; merge_filter(args, out); merge_info(args, out); if ( args->do_gvcf ) @@ -2492,9 +2953,59 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c error_errno("[%s] Failed to add program information to header", __func__); } +void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr) +{ + char **str = NULL; + int i,j, nstr = 0, mstr = 0; + for (i=0; inhrec; i++) + { + if ( hdr->hrec[i]->type!=BCF_HL_FMT ) continue; + j = bcf_hrec_find_key(hdr->hrec[i],"ID"); + if ( j<0 ) continue; + char *key = hdr->hrec[i]->vals[j]; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); + assert( id>=0 ); + int localize = 0; + if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G ) localize = 1; + if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A ) localize = 1; + if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R ) 
localize = 1; + if ( !localize ) continue; + args->tmps.l = 0; + + uint32_t e = 0, nout = 0; + e |= ksprintf(&args->tmps, "##%s=<", hdr->hrec[i]->key) < 0; + for (j=0; jhrec[i]->nkeys; j++) + { + if ( !strcmp("IDX",hdr->hrec[i]->keys[j]) ) continue; + if ( nout ) e |= kputc(',',&args->tmps) < 0; + if ( !strcmp("ID",hdr->hrec[i]->keys[j]) ) + e |= ksprintf(&args->tmps,"%s=L%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0; + else if ( !strcmp("Number",hdr->hrec[i]->keys[j]) ) + e |= ksprintf(&args->tmps,"Number=.") < 0; + else if ( !strcmp("Description",hdr->hrec[i]->keys[j]) && hdr->hrec[i]->vals[j][0]=='"' ) + e |= ksprintf(&args->tmps,"Description=\"Localized field: %s", hdr->hrec[i]->vals[j]+1) < 0; + else + e |= ksprintf(&args->tmps,"%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0; + nout++; + } + e |= ksprintf(&args->tmps,">\n") < 0; + if ( e ) error("Failed to format the header line for %s\n", key); + nstr++; + hts_expand(char*,nstr,mstr,str); + str[nstr-1] = strdup(args->tmps.s); + } + if ( !nstr ) return; + bcf_hdr_append(hdr,"##FORMAT="); + for (i=0; iout_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); @@ -2511,6 +3022,7 @@ void merge_vcf(args_t *args) char buf[24]; snprintf(buf,sizeof buf,"%d",i+1); merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples); } + if ( args->local_alleles ) hdr_add_localized_tags(args, args->out_hdr); if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); if (bcf_hdr_sync(args->out_hdr) < 0) error_errno("[%s] Failed to update header", 
__func__); @@ -2582,7 +3094,9 @@ static void usage(void) fprintf(bcftools_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); fprintf(bcftools_stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(bcftools_stderr, " -l, --file-list read file names from the file\n"); + fprintf(bcftools_stderr, " -L, --local-alleles EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); fprintf(bcftools_stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); + fprintf(bcftools_stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); @@ -2590,7 +3104,7 @@ static void usage(void) fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfmerge(int argc, char *argv[]) @@ -2610,6 +3124,7 @@ int main_vcfmerge(int argc, char *argv[]) { {"help",no_argument,NULL,'h'}, {"merge",required_argument,NULL,'m'}, + {"local-alleles",required_argument,NULL,'L'}, {"gvcf",required_argument,NULL,'g'}, {"file-list",required_argument,NULL,'l'}, {"missing-to-ref",no_argument,NULL,'0'}, @@ -2624,11 +3139,19 @@ int main_vcfmerge(int argc, char *argv[]) {"regions-file",required_argument,NULL,'R'}, {"info-rules",required_argument,NULL,'i'}, 
{"no-version",no_argument,NULL,8}, + {"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) { + char *tmp; + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) { switch (c) { + case 'L': + args->local_alleles = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg); + if ( args->local_alleles < 1 ) + error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg); + break; case 'F': if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; @@ -2674,6 +3197,7 @@ int main_vcfmerge(int argc, char *argv[]) case 3 : args->force_samples = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->no_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -2682,7 +3206,13 @@ int main_vcfmerge(int argc, char *argv[]) if ( argc==optind && !args->file_list ) usage(); if ( argc-optind<2 && !args->file_list ) usage(); - args->files->require_index = 1; + if ( args->no_index ) + { + if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n"); + bcf_sr_set_opt(args->files,BCF_SR_ALLOW_NO_IDX); + } + else + bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index dcaaba1..7b510b1 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -26,6 +26,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ THE SOFTWARE. */ #include #include "bcftools.h" #include "rbuf.h" +#include "abuf.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -84,20 +86,25 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, tmp_als_str; + kstring_t *tmp_als, tmp_kstr; int ntmp_als; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) bcf_srs_t *files; // using the synced reader only for -r option - bcf_hdr_t *hdr; + bcf_hdr_t *hdr, *out_hdr; cmpals_t cmpals_in, cmpals_out; faidx_t *fai; struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; - int record_cmd_line, force, force_warned; + int record_cmd_line, force, force_warned, keep_sum_ad; + abuf_t *abuf; + abuf_opt_t atomize; + int use_star_allele; + char *old_rec_tag; + htsFile *out; } args_t; @@ -136,7 +143,7 @@ static void seq_to_upper(char *seq, int len) static void fix_ref(args_t *args, bcf1_t *line) { int reflen = strlen(line->d.allele[0]); - int i, maxlen = reflen, len; + int i,j, maxlen = reflen, len; for (i=1; in_allele; i++) { int len = strlen(line->d.allele[i]); @@ -149,27 +156,57 @@ static void fix_ref(args_t *args, bcf1_t *line) args->nref.tot++; - // is the REF different? + // is the REF different? If not, we are done if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } - // is the REF allele missing or N? - if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') ) + // is the REF allele missing? + if ( reflen==1 && line->d.allele[0][0]=='.' 
) { line->d.allele[0][0] = ref[0]; args->nref.set++; free(ref); - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); return; } - // does REF contain non-standard bases? - if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) ) + // does REF or ALT contain non-standard bases? + int has_non_acgtn = 0; + for (i=0; in_allele; i++) + { + if ( line->d.allele[i][0]=='<' ) continue; + has_non_acgtn += replace_iupac_codes(line->d.allele[i],strlen(line->d.allele[i])); + } + if ( has_non_acgtn ) { args->nref.set++; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } } + // does the REF allele contain N's ? + int fix = 0; + for (i=0; id.allele[0][i]!='N' ) continue; + if ( ref[i]=='N' ) continue; + line->d.allele[0][i] = ref[i]; + fix++; + for (j=1; jn_allele; j++) + { + int len = strlen(line->d.allele[j]); + if ( len <= i || line->d.allele[j][i]!='N' ) continue; + line->d.allele[j][i] = ref[i]; + fix++; + } + } + if ( fix ) + { + args->nref.set++; + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + } + + // is it swapped? 
for (i=1; in_allele; i++) { @@ -178,45 +215,35 @@ static void fix_ref(args_t *args, bcf1_t *line) } kstring_t str = {0,0,0}; - if ( i==line->n_allele ) + if ( i==line->n_allele ) // none of the alternate alleles matches the reference { - // none of the alternate alleles matches the reference - if ( line->n_allele>1 ) - args->nref.set++; - else - args->nref.swap++; - - kputs(line->d.allele[0],&str); - kputc(',',&str); + args->nref.set++; + kputsn(ref,reflen,&str); for (i=1; in_allele; i++) { - kputs(line->d.allele[i],&str); kputc(',',&str); + kputs(line->d.allele[i],&str); } - kputc(ref[0],&str); - bcf_update_alleles_str(args->hdr,line,str.s); - str.l = 0; + bcf_update_alleles_str(args->out_hdr,line,str.s); + free(ref); + free(str.s); + return; } - else - args->nref.swap++; - free(ref); - // swap the alleles - int j; + // one of the alternate alleles matches the reference, assume it's a simple swap kputs(line->d.allele[i],&str); - for (j=1; jd.allele[j],&str); - } - kputc(',',&str); - kputs(line->d.allele[0],&str); - for (j=i+1; jn_allele; j++) + for (j=1; jn_allele; j++) { kputc(',',&str); - kputs(line->d.allele[j],&str); + if ( j==i ) + kputs(line->d.allele[0],&str); + else + kputs(line->d.allele[j],&str); } - bcf_update_alleles_str(args->hdr,line,str.s); + bcf_update_alleles_str(args->out_hdr,line,str.s); + args->nref.swap++; + free(ref); + free(str.s); // swap genotypes int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t @@ -231,7 +258,7 @@ static void fix_ref(args_t *args, bcf1_t *line) else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0); else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0); } - bcf_update_genotypes(args->hdr,line,gts,ngts); + bcf_update_genotypes(args->out_hdr,line,gts,ngts); // update AC int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp); @@ -240,10 +267,8 @@ static void fix_ref(args_t *args, bcf1_t *line) { int32_t *ac = (int32_t*)args->tmp_arr1; ac[i-1] = ni; - 
bcf_update_info_int32(args->hdr, line, "AC", ac, nac); + bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac); } - - free(str.s); } static void fix_dup_alt(args_t *args, bcf1_t *line) @@ -268,7 +293,7 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) if ( !args->tmp_arr1[i] ) continue; line->d.allele[j++] = line->d.allele[i]; } - bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals); + bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals); // update genotypes @@ -286,7 +311,36 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new); changed = 1; } - if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts); + if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts); +} + +static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) +{ + if ( !args->old_rec_tag ) return; + + // only update if the tag is not present already, there can be multiple normalization steps + int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); + bcf_unpack(dst, BCF_UN_INFO); + for (i=0; in_info; i++) + { + bcf_info_t *inf = &dst->d.info[i]; + if ( inf && inf->key == id ) return; + } + + args->tmp_kstr.l = 0; + ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); + for (i=1; in_allele; i++) + { + kputs(src->d.allele[i],&args->tmp_kstr); + if ( i+1n_allele ) kputc(',',&args->tmp_kstr); + } + if ( ialt>0 ) + { + kputc('|',&args->tmp_kstr); + kputw(ialt,&args->tmp_kstr); + } + if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) + error("An error occurred while updating INFO/%s\n",args->old_rec_tag); } #define ERR_DUP_ALLELE -2 @@ -333,7 +387,7 @@ static int realign(args_t *args, bcf1_t *line) if ( line->rlen > 1 ) { line->d.allele[0][1] = 0; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + 
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); } return ERR_OK; } @@ -363,7 +417,7 @@ static int realign(args_t *args, bcf1_t *line) } // trim from right - int ori_pos = line->pos; + int new_pos = line->pos; while (1) { // is the rightmost base identical in all alleles? @@ -374,7 +428,7 @@ static int realign(args_t *args, bcf1_t *line) if ( als[i].l < min_len ) min_len = als[i].l; } if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed - if ( min_len<=1 && line->pos==0 ) break; + if ( min_len<=1 && new_pos==0 ) break; int pad_from_left = 0; for (i=0; in_allele; i++) // trim all alleles @@ -384,10 +438,10 @@ static int realign(args_t *args, bcf1_t *line) } if ( pad_from_left ) { - int npad = line->pos >= args->aln_win ? args->aln_win : line->pos; + int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); - if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1); replace_iupac_codes(ref,nref); for (i=0; in_allele; i++) { @@ -396,7 +450,7 @@ static int realign(args_t *args, bcf1_t *line) memcpy(als[i].s,ref,npad); als[i].l += npad; } - line->pos -= npad; + new_pos -= npad; } } free(ref); @@ -422,39 +476,43 @@ static int realign(args_t *args, bcf1_t *line) memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); als[i].l -= ntrim_left; } - line->pos += ntrim_left; + new_pos += ntrim_left; } // Have the alleles changed? 
als[0].s[ als[0].l ] = 0; // in order for strcmp to work - if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; + if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; + + set_old_rec_tag(args, line, line, 0); // Create new block of alleles and update - args->tmp_als_str.l = 0; + args->tmp_kstr.l = 0; for (i=0; in_allele; i++) { - if (i>0) kputc(',',&args->tmp_als_str); - kputsn(als[i].s,als[i].l,&args->tmp_als_str); + if (i>0) kputc(',',&args->tmp_kstr); + kputsn(als[i].s,als[i].l,&args->tmp_kstr); } - args->tmp_als_str.s[ args->tmp_als_str.l ] = 0; - bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s); + args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; + bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s); args->nchanged++; // Update INFO/END if necessary int new_reflen = strlen(line->d.allele[0]); - if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 ) + if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 ) { // bcf_update_alleles_str() messed up rlen because line->pos changed. 
This will be fixed by bcf_update_info_int32() + line->pos = new_pos; args->int32_arr[0] = line->pos + new_reflen; - bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1); + bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1); } + line->pos = new_pos; return ERR_OK; } static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst) { - #define BRANCH_NUMERIC(type,type_t) \ + #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing) \ { \ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \ int ntmp = args->ntmp_arr1 / sizeof(type_t); \ @@ -477,13 +535,13 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -499,14 +557,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ } \ - if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ - bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ + if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \ + { \ + int j; \ + for (j=1; jlen; j++) \ + if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) vals[0] += vals[j]; \ + vals[1] = vals[ialt+1]; \ + } \ + else \ + { \ + if ( ialt!=0 ) vals[1] = 
vals[ialt+1]; \ + } \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -522,7 +590,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -533,15 +601,15 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals,3); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \ } \ else \ - bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key)) { - case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t); break; - case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break; + case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, vals[j]==bcf_int32_vector_end, vals[j]==bcf_int32_missing); break; + case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(vals[j]), bcf_float_is_missing(vals[j])); break; } #undef BRANCH_NUMERIC } @@ -589,7 +657,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_R ) { @@ -600,7 +668,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + 
bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_G ) { @@ -615,16 +683,16 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst) { const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_flag(args->hdr,dst,tag,NULL,ret); + bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); } static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) @@ -650,11 +718,11 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } gt += ngts; } - bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl); + bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl); } static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) { - #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \ + #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \ { \ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \ int ntmp = args->ntmp_arr1 / sizeof(type_t); \ @@ -663,10 +731,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int assert( nvals>0 ); \ type_t *vals = (type_t *) args->tmp_arr1; \ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \ - int i, nsmpl = bcf_hdr_nsamples(args->hdr); \ + int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \ if ( nvals==nsmpl ) /* all values are missing */ \ { \ - 
bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \ return; \ } \ if ( len==BCF_VL_A ) \ @@ -683,7 +751,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -697,7 +765,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int dst_vals += 1; \ src_vals += nvals; \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -713,7 +781,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -721,14 +789,29 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ nvals /= nsmpl; \ type_t *src_vals = vals, *dst_vals = vals; \ - for (i=0; ikeep_sum_ad >= 0 && args->keep_sum_ad==fmt->id ) \ { \ - dst_vals[0] = src_vals[0]; \ - dst_vals[1] = src_vals[ialt+1]; \ - dst_vals += 2; \ - src_vals += nvals; \ + for (i=0; ihdr,dst,tag,vals,nsmpl*2); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -744,7 +827,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", 
bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ @@ -775,15 +858,15 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int dst_vals += all_haploid ? 2 : 3; \ src_vals += nvals; \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \ } \ else \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id)) { - case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break; - case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break; + case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break; + case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break; } #undef BRANCH_NUMERIC } @@ -825,7 +908,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else if ( len==BCF_VL_R ) { @@ -843,7 +926,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else if ( len==BCF_VL_G ) { @@ -871,7 +954,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i } if ( args->force ) { - bcf_update_format_char(args->hdr,dst,tag,NULL,0); + bcf_update_format_char(args->out_hdr,dst,tag,NULL,0); return; 
} error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", @@ -902,13 +985,12 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else - bcf_update_format_char(args->hdr,dst,tag,str.s,str.l); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l); } - static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) { int i; @@ -941,11 +1023,11 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) // Not quite sure how to handle IDs, they can be assigned to a specific // ALT. For now we leave the ID unchanged for all. - bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : "."); + bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : "."); tmp.l = rlen; kputs(line->d.allele[i+1],&tmp); - bcf_update_alleles_str(args->hdr,dst,tmp.s); + bcf_update_alleles_str(args->out_hdr,dst,tmp.s); if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt); @@ -958,6 +1040,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst); else split_info_string(args, line, info, i, dst); } + set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes dst->n_sample = line->n_sample; for (j=0; jn_fmt; j++) @@ -1021,7 +1104,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \ } \ } \ - bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -1045,7 +1128,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf vals[ args->maps[i].map[k] ] = vals2[k]; \ } \ } \ - 
bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -1079,10 +1162,10 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf } \ } \ } \ - bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else \ - bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key)) { @@ -1095,7 +1178,7 @@ static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t { const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_flag(args->hdr,dst,tag,NULL,ret); + bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst) @@ -1123,7 +1206,7 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info str.s[str.l] = 0; args->tmp_arr1 = (uint8_t*) str.s; args->ntmp_arr1 = str.m; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_G ) { @@ -1150,12 +1233,12 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info str.s[str.l] = 0; args->tmp_arr1 = (uint8_t*) str.s; args->ntmp_arr1 = str.m; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else { bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1); + 
bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1); } } static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst) @@ -1198,7 +1281,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ gt2 += ngts; } } - bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl); + bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl); } static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals) { @@ -1251,7 +1334,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals2; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -1279,7 +1362,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals2; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -1358,10 +1441,10 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals;\ }\ }\ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id)) { @@ -1378,7 +1461,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G ) { int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret); + 
bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret); return; } @@ -1397,7 +1480,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm for (i=0; ihdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1); - if (nret<0) continue; /* format tag does not exist in this record, skip */ \ + if (nret<0) continue; /* format tag does not exist in this record, skip */ nret /= nsmpl; for (k=0; khdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1); - if (nret<0) continue; /* format tag does not exist in this record, skip */ \ + if (nret<0) continue; /* format tag does not exist in this record, skip */ nret /= nsmpl; } for (k=0; kntmp_arr2 = str.m; args->tmp_arr2 = (uint8_t*)str.s; - bcf_update_format_char(args->hdr,dst,tag,str.s,str.l); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l); } char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c @@ -1511,7 +1594,7 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * dst->qual = lines[i]->qual; } - bcf_update_id(args->hdr, dst, lines[0]->d.id); + bcf_update_id(args->out_hdr, dst, lines[0]->d.id); // Merge and set the alleles, create a mapping from source allele indexes to dst idxs hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line @@ -1525,20 +1608,20 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * } for (i=1; id.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id); + if (lines[i]->d.id[0]!='.' 
|| lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id); args->maps[i].nals = lines[i]->n_allele; hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); } - bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); + bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals); for (i=0; inals; i++) { free(args->als[i]); args->als[i] = NULL; } - if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt); + if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt); for (i=1; id.n_flt; j++) { @@ -1546,13 +1629,13 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * // otherwise accumulate FILTERs if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) { if (args->strict_filter) { - bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt); + bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt); break; } else continue; } - bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]); + bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]); } } @@ -1722,7 +1805,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) if ( mrows_ready_to_flush(args, args->lines[k]) ) { while ( (line=mrows_flush(args)) ) - if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } int merge = 1; if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) @@ -1755,18 +1838,30 @@ static void flush_buffer(args_t *args, htsFile *file, int n) 
prev_type |= line_type; if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); } - if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) { while ( (line=mrows_flush(args)) ) - if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } } static void init_data(args_t *args) { args->hdr = args->files->readers[0].header; + if ( args->keep_sum_ad ) + { + args->keep_sum_ad = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"AD"); + if ( args->keep_sum_ad < 0 ) error("Error: --keep-sum-ad requested but the tag AD is not present\n"); + } + else + args->keep_sum_ad = -1; + + args->out_hdr = bcf_hdr_dup(args->hdr); + if ( args->old_rec_tag ) + bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); + rbuf_init(&args->rbuf, 100); args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); if ( args->ref_fname ) @@ -1780,6 +1875,14 @@ static void init_data(args_t *args) args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t)); args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr)); } + if ( args->atomize==SPLIT ) + { + args->abuf = abuf_init(args->hdr, SPLIT); + abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr); + if ( args->old_rec_tag ) + abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag); + abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele); + } } static void destroy_data(args_t *args) @@ -1804,7 +1907,7 @@ static void destroy_data(args_t *args) for (i=0; intmp_als; i++) free(args->tmp_als[i].s); free(args->tmp_als); - free(args->tmp_als_str.s); 
+ free(args->tmp_kstr.s); if ( args->tmp_str ) { for (i=0; ihdr); i++) free(args->tmp_str[i].s); @@ -1816,15 +1919,16 @@ static void destroy_data(args_t *args) free(args->tmp_arr1); free(args->tmp_arr2); free(args->diploid); + if ( args->abuf ) abuf_destroy(args->abuf); + bcf_hdr_destroy(args->out_hdr); if ( args->mrow_out ) bcf_destroy1(args->mrow_out); if ( args->fai ) fai_destroy(args->fai); if ( args->mseq ) free(args->seq); } -static void normalize_line(args_t *args, bcf1_t **line_ptr) +static void normalize_line(args_t *args, bcf1_t *line) { - bcf1_t *line = *line_ptr; if ( args->fai ) { if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line); @@ -1854,8 +1958,8 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines); int i,j; i = j = rbuf_append(&args->rbuf); - if ( !args->lines[i] ) args->lines[i] = bcf_init1(); - SWAP(bcf1_t*, (*line_ptr), args->lines[i]); + if ( args->lines[i] ) bcf_destroy(args->lines[i]); + args->lines[i] = bcf_dup(line); while ( rbuf_prev(&args->rbuf,&i) ) { if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]); @@ -1863,21 +1967,38 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) } } +static bcf1_t *next_atomized_line(args_t *args) +{ + bcf1_t *rec = NULL; + if ( args->atomize==SPLIT ) + { + rec = abuf_flush(args->abuf, 0); + if ( rec ) return rec; + } + + if ( !bcf_sr_next_line(args->files) ) return NULL; + + if ( args->atomize==SPLIT ) + { + abuf_push(args->abuf,bcf_sr_get_line(args->files,0)); + return abuf_flush(args->abuf, 0); + } + return bcf_sr_get_line(args->files,0); +} static void normalize_vcf(args_t *args) { - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); - if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); + args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + if ( args->out == 
NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) - hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); - if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); + if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; - while ( bcf_sr_next_line(args->files) ) + while ( (line = next_atomized_line(args)) ) { args->ntotal++; - - bcf1_t *line = args->files->readers[0].buffer[0]; if ( args->rmdup ) { int line_type = bcf_get_variant_types(line); @@ -1901,7 +2022,7 @@ static void normalize_vcf(args_t *args) // still on the same chromosome? 
int i,j,ilast = rbuf_last(&args->rbuf); - if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome + if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome int split = 0; if ( args->mrows_op==MROWS_SPLIT ) @@ -1916,13 +2037,13 @@ static void normalize_vcf(args_t *args) args->nsplit++; split_multiallelic_to_biallelics(args, line); for (j=0; jntmp_lines; j++) - normalize_line(args, &args->tmp_lines[j]); + normalize_line(args, args->tmp_lines[j]); } else split = 0; } if ( !split ) - normalize_line(args, &args->files->readers[0].buffer[0]); + normalize_line(args, line); // find out how many sites to flush ilast = rbuf_last(&args->rbuf); @@ -1932,10 +2053,10 @@ static void normalize_vcf(args_t *args) if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break; j++; } - if ( j>0 ) flush_buffer(args, out, j); + if ( j>0 ) flush_buffer(args, args->out, j); } - flush_buffer(args, out, args->rbuf.n); - if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + flush_buffer(args, args->out, args->rbuf.n); + if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); if ( args->check_ref & CHECK_REF_FIX ) @@ -1951,23 +2072,27 @@ static void usage(void) fprintf(stderr, "Usage: bcftools norm [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence\n"); - fprintf(stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); - fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position 
during realignment [1000]\n"); + fprintf(stderr, " -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs)\n"); + fprintf(stderr, " --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n"); + fprintf(stderr, " -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); + fprintf(stderr, " -D, --remove-duplicates Remove duplicate lines of the same type.\n"); + fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); + fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n"); + fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); + fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); + fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); + fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than 
index-jumps\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # normalize and left-align indels\n"); @@ -1995,11 +2120,16 @@ int main_vcfnorm(int argc, char *argv[]) args->do_indels = 1; int region_is_file = 0; int targets_is_file = 0; + args->use_star_allele = 1; static struct option loptions[] = { {"help",no_argument,NULL,'h'}, {"force",no_argument,NULL,7}, + {"atomize",no_argument,NULL,'a'}, + {"atom-overlaps",required_argument,NULL,11}, + {"old-rec-tag",required_argument,NULL,12}, + {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, @@ -2019,8 +2149,21 @@ int main_vcfnorm(int argc, char *argv[]) {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) { switch (c) { + case 10: + // possibly generalize this also to INFO/AD and other tags + if ( strcasecmp("ad",optarg) ) + error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n"); + args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data + break; + case 'a': args->atomize = SPLIT; break; + case 11 : + if ( optarg[0]=='*' ) args->use_star_allele = 1; + else if ( optarg[0]=='.' ) args->use_star_allele = 0; + else error("Invalid argument to --atom-overlaps. 
Perhaps you wanted: \"--atom-overlaps '*'\"?\n"); + break; + case 12 : args->old_rec_tag = optarg; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; @@ -2092,7 +2235,7 @@ int main_vcfnorm(int argc, char *argv[]) } else fname = argv[optind]; - if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); + if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n"); if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 6125a1b..e48443f 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -2,7 +2,7 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -28,6 +28,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ THE SOFTWARE. 
*/ #include #include "bcftools.h" #include "rbuf.h" +#include "abuf.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -86,20 +88,25 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, tmp_als_str; + kstring_t *tmp_als, tmp_kstr; int ntmp_als; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) bcf_srs_t *files; // using the synced reader only for -r option - bcf_hdr_t *hdr; + bcf_hdr_t *hdr, *out_hdr; cmpals_t cmpals_in, cmpals_out; faidx_t *fai; struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; - int record_cmd_line, force, force_warned; + int record_cmd_line, force, force_warned, keep_sum_ad; + abuf_t *abuf; + abuf_opt_t atomize; + int use_star_allele; + char *old_rec_tag; + htsFile *out; } args_t; @@ -138,7 +145,7 @@ static void seq_to_upper(char *seq, int len) static void fix_ref(args_t *args, bcf1_t *line) { int reflen = strlen(line->d.allele[0]); - int i, maxlen = reflen, len; + int i,j, maxlen = reflen, len; for (i=1; in_allele; i++) { int len = strlen(line->d.allele[i]); @@ -151,27 +158,57 @@ static void fix_ref(args_t *args, bcf1_t *line) args->nref.tot++; - // is the REF different? + // is the REF different? If not, we are done if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } - // is the REF allele missing or N? - if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') ) + // is the REF allele missing? + if ( reflen==1 && line->d.allele[0][0]=='.' 
) { line->d.allele[0][0] = ref[0]; args->nref.set++; free(ref); - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); return; } - // does REF contain non-standard bases? - if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) ) + // does REF or ALT contain non-standard bases? + int has_non_acgtn = 0; + for (i=0; in_allele; i++) + { + if ( line->d.allele[i][0]=='<' ) continue; + has_non_acgtn += replace_iupac_codes(line->d.allele[i],strlen(line->d.allele[i])); + } + if ( has_non_acgtn ) { args->nref.set++; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } } + // does the REF allele contain N's ? + int fix = 0; + for (i=0; id.allele[0][i]!='N' ) continue; + if ( ref[i]=='N' ) continue; + line->d.allele[0][i] = ref[i]; + fix++; + for (j=1; jn_allele; j++) + { + int len = strlen(line->d.allele[j]); + if ( len <= i || line->d.allele[j][i]!='N' ) continue; + line->d.allele[j][i] = ref[i]; + fix++; + } + } + if ( fix ) + { + args->nref.set++; + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); + if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } + } + + // is it swapped? 
for (i=1; in_allele; i++) { @@ -180,45 +217,35 @@ static void fix_ref(args_t *args, bcf1_t *line) } kstring_t str = {0,0,0}; - if ( i==line->n_allele ) + if ( i==line->n_allele ) // none of the alternate alleles matches the reference { - // none of the alternate alleles matches the reference - if ( line->n_allele>1 ) - args->nref.set++; - else - args->nref.swap++; - - kputs(line->d.allele[0],&str); - kputc(',',&str); + args->nref.set++; + kputsn(ref,reflen,&str); for (i=1; in_allele; i++) { - kputs(line->d.allele[i],&str); kputc(',',&str); + kputs(line->d.allele[i],&str); } - kputc(ref[0],&str); - bcf_update_alleles_str(args->hdr,line,str.s); - str.l = 0; + bcf_update_alleles_str(args->out_hdr,line,str.s); + free(ref); + free(str.s); + return; } - else - args->nref.swap++; - free(ref); - // swap the alleles - int j; + // one of the alternate alleles matches the reference, assume it's a simple swap kputs(line->d.allele[i],&str); - for (j=1; jd.allele[j],&str); - } - kputc(',',&str); - kputs(line->d.allele[0],&str); - for (j=i+1; jn_allele; j++) + for (j=1; jn_allele; j++) { kputc(',',&str); - kputs(line->d.allele[j],&str); + if ( j==i ) + kputs(line->d.allele[0],&str); + else + kputs(line->d.allele[j],&str); } - bcf_update_alleles_str(args->hdr,line,str.s); + bcf_update_alleles_str(args->out_hdr,line,str.s); + args->nref.swap++; + free(ref); + free(str.s); // swap genotypes int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t @@ -233,7 +260,7 @@ static void fix_ref(args_t *args, bcf1_t *line) else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0); else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0); } - bcf_update_genotypes(args->hdr,line,gts,ngts); + bcf_update_genotypes(args->out_hdr,line,gts,ngts); // update AC int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp); @@ -242,10 +269,8 @@ static void fix_ref(args_t *args, bcf1_t *line) { int32_t *ac = (int32_t*)args->tmp_arr1; ac[i-1] = ni; - 
bcf_update_info_int32(args->hdr, line, "AC", ac, nac); + bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac); } - - free(str.s); } static void fix_dup_alt(args_t *args, bcf1_t *line) @@ -270,7 +295,7 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) if ( !args->tmp_arr1[i] ) continue; line->d.allele[j++] = line->d.allele[i]; } - bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals); + bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals); // update genotypes @@ -288,7 +313,36 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new); changed = 1; } - if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts); + if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts); +} + +static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) +{ + if ( !args->old_rec_tag ) return; + + // only update if the tag is not present already, there can be multiple normalization steps + int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); + bcf_unpack(dst, BCF_UN_INFO); + for (i=0; in_info; i++) + { + bcf_info_t *inf = &dst->d.info[i]; + if ( inf && inf->key == id ) return; + } + + args->tmp_kstr.l = 0; + ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); + for (i=1; in_allele; i++) + { + kputs(src->d.allele[i],&args->tmp_kstr); + if ( i+1n_allele ) kputc(',',&args->tmp_kstr); + } + if ( ialt>0 ) + { + kputc('|',&args->tmp_kstr); + kputw(ialt,&args->tmp_kstr); + } + if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) + error("An error occurred while updating INFO/%s\n",args->old_rec_tag); } #define ERR_DUP_ALLELE -2 @@ -335,7 +389,7 @@ static int realign(args_t *args, bcf1_t *line) if ( line->rlen > 1 ) { line->d.allele[0][1] = 0; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + 
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); } return ERR_OK; } @@ -365,7 +419,7 @@ static int realign(args_t *args, bcf1_t *line) } // trim from right - int ori_pos = line->pos; + int new_pos = line->pos; while (1) { // is the rightmost base identical in all alleles? @@ -376,7 +430,7 @@ static int realign(args_t *args, bcf1_t *line) if ( als[i].l < min_len ) min_len = als[i].l; } if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed - if ( min_len<=1 && line->pos==0 ) break; + if ( min_len<=1 && new_pos==0 ) break; int pad_from_left = 0; for (i=0; in_allele; i++) // trim all alleles @@ -386,10 +440,10 @@ static int realign(args_t *args, bcf1_t *line) } if ( pad_from_left ) { - int npad = line->pos >= args->aln_win ? args->aln_win : line->pos; + int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); - if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1); replace_iupac_codes(ref,nref); for (i=0; in_allele; i++) { @@ -398,7 +452,7 @@ static int realign(args_t *args, bcf1_t *line) memcpy(als[i].s,ref,npad); als[i].l += npad; } - line->pos -= npad; + new_pos -= npad; } } free(ref); @@ -424,39 +478,43 @@ static int realign(args_t *args, bcf1_t *line) memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); als[i].l -= ntrim_left; } - line->pos += ntrim_left; + new_pos += ntrim_left; } // Have the alleles changed? 
als[0].s[ als[0].l ] = 0; // in order for strcmp to work - if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; + if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; + + set_old_rec_tag(args, line, line, 0); // Create new block of alleles and update - args->tmp_als_str.l = 0; + args->tmp_kstr.l = 0; for (i=0; in_allele; i++) { - if (i>0) kputc(',',&args->tmp_als_str); - kputsn(als[i].s,als[i].l,&args->tmp_als_str); + if (i>0) kputc(',',&args->tmp_kstr); + kputsn(als[i].s,als[i].l,&args->tmp_kstr); } - args->tmp_als_str.s[ args->tmp_als_str.l ] = 0; - bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s); + args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; + bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s); args->nchanged++; // Update INFO/END if necessary int new_reflen = strlen(line->d.allele[0]); - if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 ) + if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 ) { // bcf_update_alleles_str() messed up rlen because line->pos changed. 
This will be fixed by bcf_update_info_int32() + line->pos = new_pos; args->int32_arr[0] = line->pos + new_reflen; - bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1); + bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1); } + line->pos = new_pos; return ERR_OK; } static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst) { - #define BRANCH_NUMERIC(type,type_t) \ + #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing) \ { \ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \ int ntmp = args->ntmp_arr1 / sizeof(type_t); \ @@ -479,13 +537,13 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -501,14 +559,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ } \ - if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ - bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ + if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \ + { \ + int j; \ + for (j=1; jlen; j++) \ + if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) vals[0] += vals[j]; \ + vals[1] = vals[ialt+1]; \ + } \ + else \ + { \ + if ( ialt!=0 ) vals[1] = 
vals[ialt+1]; \ + } \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -524,7 +592,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -535,15 +603,15 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals,3); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \ } \ else \ - bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key)) { - case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t); break; - case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break; + case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, vals[j]==bcf_int32_vector_end, vals[j]==bcf_int32_missing); break; + case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(vals[j]), bcf_float_is_missing(vals[j])); break; } #undef BRANCH_NUMERIC } @@ -591,7 +659,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_R ) { @@ -602,7 +670,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + 
bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_G ) { @@ -617,16 +685,16 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst) { const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_flag(args->hdr,dst,tag,NULL,ret); + bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); } static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) @@ -652,11 +720,11 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } gt += ngts; } - bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl); + bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl); } static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) { - #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \ + #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \ { \ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \ int ntmp = args->ntmp_arr1 / sizeof(type_t); \ @@ -665,10 +733,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int assert( nvals>0 ); \ type_t *vals = (type_t *) args->tmp_arr1; \ int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \ - int i, nsmpl = bcf_hdr_nsamples(args->hdr); \ + int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \ if ( nvals==nsmpl ) /* all values are missing */ \ { \ - 
bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \ return; \ } \ if ( len==BCF_VL_A ) \ @@ -685,7 +753,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -699,7 +767,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int dst_vals += 1; \ src_vals += nvals; \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -715,7 +783,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -723,14 +791,29 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ nvals /= nsmpl; \ type_t *src_vals = vals, *dst_vals = vals; \ - for (i=0; ikeep_sum_ad >= 0 && args->keep_sum_ad==fmt->id ) \ { \ - dst_vals[0] = src_vals[0]; \ - dst_vals[1] = src_vals[ialt+1]; \ - dst_vals += 2; \ - src_vals += nvals; \ + for (i=0; ihdr,dst,tag,vals,nsmpl*2); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -746,7 +829,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", 
bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ @@ -777,15 +860,15 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int dst_vals += all_haploid ? 2 : 3; \ src_vals += nvals; \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \ } \ else \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id)) { - case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break; - case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break; + case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break; + case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break; } #undef BRANCH_NUMERIC } @@ -827,7 +910,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else if ( len==BCF_VL_R ) { @@ -845,7 +928,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else if ( len==BCF_VL_G ) { @@ -873,7 +956,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i } if ( args->force ) { - bcf_update_format_char(args->hdr,dst,tag,NULL,0); + bcf_update_format_char(args->out_hdr,dst,tag,NULL,0); return; 
} error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", @@ -904,13 +987,12 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else - bcf_update_format_char(args->hdr,dst,tag,str.s,str.l); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l); } - static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) { int i; @@ -943,11 +1025,11 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) // Not quite sure how to handle IDs, they can be assigned to a specific // ALT. For now we leave the ID unchanged for all. - bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : "."); + bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : "."); tmp.l = rlen; kputs(line->d.allele[i+1],&tmp); - bcf_update_alleles_str(args->hdr,dst,tmp.s); + bcf_update_alleles_str(args->out_hdr,dst,tmp.s); if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt); @@ -960,6 +1042,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst); else split_info_string(args, line, info, i, dst); } + set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes dst->n_sample = line->n_sample; for (j=0; jn_fmt; j++) @@ -1023,7 +1106,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \ } \ } \ - bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -1047,7 +1130,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf vals[ args->maps[i].map[k] ] = vals2[k]; \ } \ } \ - 
bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -1081,10 +1164,10 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf } \ } \ } \ - bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else \ - bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key)) { @@ -1097,7 +1180,7 @@ static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t { const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_flag(args->hdr,dst,tag,NULL,ret); + bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst) @@ -1125,7 +1208,7 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info str.s[str.l] = 0; args->tmp_arr1 = (uint8_t*) str.s; args->ntmp_arr1 = str.m; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_G ) { @@ -1152,12 +1235,12 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info str.s[str.l] = 0; args->tmp_arr1 = (uint8_t*) str.s; args->ntmp_arr1 = str.m; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else { bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1); + 
bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1); } } static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst) @@ -1200,7 +1283,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ gt2 += ngts; } } - bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl); + bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl); } static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals) { @@ -1253,7 +1336,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals2; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -1281,7 +1364,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals2; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -1360,10 +1443,10 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals;\ }\ }\ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id)) { @@ -1380,7 +1463,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G ) { int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret); + 
bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret); return; } @@ -1399,7 +1482,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm for (i=0; ihdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1); - if (nret<0) continue; /* format tag does not exist in this record, skip */ \ + if (nret<0) continue; /* format tag does not exist in this record, skip */ nret /= nsmpl; for (k=0; khdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1); - if (nret<0) continue; /* format tag does not exist in this record, skip */ \ + if (nret<0) continue; /* format tag does not exist in this record, skip */ nret /= nsmpl; } for (k=0; kntmp_arr2 = str.m; args->tmp_arr2 = (uint8_t*)str.s; - bcf_update_format_char(args->hdr,dst,tag,str.s,str.l); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l); } char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c @@ -1513,7 +1596,7 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * dst->qual = lines[i]->qual; } - bcf_update_id(args->hdr, dst, lines[0]->d.id); + bcf_update_id(args->out_hdr, dst, lines[0]->d.id); // Merge and set the alleles, create a mapping from source allele indexes to dst idxs hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line @@ -1527,20 +1610,20 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * } for (i=1; id.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id); + if (lines[i]->d.id[0]!='.' 
|| lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id); args->maps[i].nals = lines[i]->n_allele; hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); } - bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); + bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals); for (i=0; inals; i++) { free(args->als[i]); args->als[i] = NULL; } - if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt); + if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt); for (i=1; id.n_flt; j++) { @@ -1548,13 +1631,13 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * // otherwise accumulate FILTERs if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) { if (args->strict_filter) { - bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt); + bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt); break; } else continue; } - bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]); + bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]); } } @@ -1724,7 +1807,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) if ( mrows_ready_to_flush(args, args->lines[k]) ) { while ( (line=mrows_flush(args)) ) - if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } int merge = 1; if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) @@ -1757,18 +1840,30 @@ static void flush_buffer(args_t *args, htsFile *file, int n) 
prev_type |= line_type; if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); } - if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) { while ( (line=mrows_flush(args)) ) - if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } } static void init_data(args_t *args) { args->hdr = args->files->readers[0].header; + if ( args->keep_sum_ad ) + { + args->keep_sum_ad = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"AD"); + if ( args->keep_sum_ad < 0 ) error("Error: --keep-sum-ad requested but the tag AD is not present\n"); + } + else + args->keep_sum_ad = -1; + + args->out_hdr = bcf_hdr_dup(args->hdr); + if ( args->old_rec_tag ) + bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); + rbuf_init(&args->rbuf, 100); args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); if ( args->ref_fname ) @@ -1782,6 +1877,14 @@ static void init_data(args_t *args) args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t)); args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr)); } + if ( args->atomize==SPLIT ) + { + args->abuf = abuf_init(args->hdr, SPLIT); + abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr); + if ( args->old_rec_tag ) + abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag); + abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele); + } } static void destroy_data(args_t *args) @@ -1806,7 +1909,7 @@ static void destroy_data(args_t *args) for (i=0; intmp_als; i++) free(args->tmp_als[i].s); free(args->tmp_als); - free(args->tmp_als_str.s); 
+ free(args->tmp_kstr.s); if ( args->tmp_str ) { for (i=0; ihdr); i++) free(args->tmp_str[i].s); @@ -1818,15 +1921,16 @@ static void destroy_data(args_t *args) free(args->tmp_arr1); free(args->tmp_arr2); free(args->diploid); + if ( args->abuf ) abuf_destroy(args->abuf); + bcf_hdr_destroy(args->out_hdr); if ( args->mrow_out ) bcf_destroy1(args->mrow_out); if ( args->fai ) fai_destroy(args->fai); if ( args->mseq ) free(args->seq); } -static void normalize_line(args_t *args, bcf1_t **line_ptr) +static void normalize_line(args_t *args, bcf1_t *line) { - bcf1_t *line = *line_ptr; if ( args->fai ) { if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line); @@ -1856,8 +1960,8 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines); int i,j; i = j = rbuf_append(&args->rbuf); - if ( !args->lines[i] ) args->lines[i] = bcf_init1(); - SWAP(bcf1_t*, (*line_ptr), args->lines[i]); + if ( args->lines[i] ) bcf_destroy(args->lines[i]); + args->lines[i] = bcf_dup(line); while ( rbuf_prev(&args->rbuf,&i) ) { if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]); @@ -1865,21 +1969,38 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) } } +static bcf1_t *next_atomized_line(args_t *args) +{ + bcf1_t *rec = NULL; + if ( args->atomize==SPLIT ) + { + rec = abuf_flush(args->abuf, 0); + if ( rec ) return rec; + } + + if ( !bcf_sr_next_line(args->files) ) return NULL; + + if ( args->atomize==SPLIT ) + { + abuf_push(args->abuf,bcf_sr_get_line(args->files,0)); + return abuf_flush(args->abuf, 0); + } + return bcf_sr_get_line(args->files,0); +} static void normalize_vcf(args_t *args) { - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); - if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); + args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + if ( args->out == 
NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) - hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); - if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); + if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; - while ( bcf_sr_next_line(args->files) ) + while ( (line = next_atomized_line(args)) ) { args->ntotal++; - - bcf1_t *line = args->files->readers[0].buffer[0]; if ( args->rmdup ) { int line_type = bcf_get_variant_types(line); @@ -1903,7 +2024,7 @@ static void normalize_vcf(args_t *args) // still on the same chromosome? 
int i,j,ilast = rbuf_last(&args->rbuf); - if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome + if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome int split = 0; if ( args->mrows_op==MROWS_SPLIT ) @@ -1918,13 +2039,13 @@ static void normalize_vcf(args_t *args) args->nsplit++; split_multiallelic_to_biallelics(args, line); for (j=0; jntmp_lines; j++) - normalize_line(args, &args->tmp_lines[j]); + normalize_line(args, args->tmp_lines[j]); } else split = 0; } if ( !split ) - normalize_line(args, &args->files->readers[0].buffer[0]); + normalize_line(args, line); // find out how many sites to flush ilast = rbuf_last(&args->rbuf); @@ -1934,10 +2055,10 @@ static void normalize_vcf(args_t *args) if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break; j++; } - if ( j>0 ) flush_buffer(args, out, j); + if ( j>0 ) flush_buffer(args, args->out, j); } - flush_buffer(args, out, args->rbuf.n); - if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + flush_buffer(args, args->out, args->rbuf.n); + if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); if ( args->check_ref & CHECK_REF_FIX ) @@ -1953,23 +2074,27 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools norm [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); - fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence\n"); - fprintf(bcftools_stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); - fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than 
index-jumps\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(bcftools_stderr, " -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs)\n"); + fprintf(bcftools_stderr, " --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n"); + fprintf(bcftools_stderr, " -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); + fprintf(bcftools_stderr, " -D, --remove-duplicates Remove duplicate lines of the same type.\n"); + fprintf(bcftools_stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence\n"); + fprintf(bcftools_stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); + fprintf(bcftools_stderr, " --keep-sum TAG,.. 
Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); + fprintf(bcftools_stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); + fprintf(bcftools_stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # normalize and left-align indels\n"); @@ -1978,7 +2103,7 @@ static void usage(void) fprintf(bcftools_stderr, " # split multi-allelic sites\n"); fprintf(bcftools_stderr, " bcftools norm -m- in.vcf\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfnorm(int argc, char *argv[]) @@ -1997,11 +2122,16 @@ int main_vcfnorm(int 
argc, char *argv[]) args->do_indels = 1; int region_is_file = 0; int targets_is_file = 0; + args->use_star_allele = 1; static struct option loptions[] = { {"help",no_argument,NULL,'h'}, {"force",no_argument,NULL,7}, + {"atomize",no_argument,NULL,'a'}, + {"atom-overlaps",required_argument,NULL,11}, + {"old-rec-tag",required_argument,NULL,12}, + {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, @@ -2021,8 +2151,21 @@ int main_vcfnorm(int argc, char *argv[]) {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) { switch (c) { + case 10: + // possibly generalize this also to INFO/AD and other tags + if ( strcasecmp("ad",optarg) ) + error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n"); + args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data + break; + case 'a': args->atomize = SPLIT; break; + case 11 : + if ( optarg[0]=='*' ) args->use_star_allele = 1; + else if ( optarg[0]=='.' ) args->use_star_allele = 0; + else error("Invalid argument to --atom-overlaps. 
Perhaps you wanted: \"--atom-overlaps '*'\"?\n"); + break; + case 12 : args->old_rec_tag = optarg; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; @@ -2094,7 +2237,7 @@ int main_vcfnorm(int argc, char *argv[]) } else fname = argv[optind]; - if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); + if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n"); if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index a161529..c4ea52d 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -148,7 +148,7 @@ typedef struct _args_t char **plugin_paths; char **argv, *output_fname, *regions_list, *targets_list; - int argc, drop_header, verbose, record_cmd_line; + int argc, drop_header, verbose, record_cmd_line, plist_only; } args_t; @@ -178,7 +178,7 @@ static void add_plugin_paths(args_t *args, const char *path) args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1)); args->plugin_paths[args->nplugin_paths] = dir; args->nplugin_paths++; - if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. ok\n", dir); + if ( args->verbose > 1 && strcmp(".",dir) ) fprintf(stderr, "plugin directory %s .. 
ok\n", dir); } else { @@ -220,6 +220,8 @@ static void *dlopen_plugin(args_t *args, const char *fname) #else if ( fname[0]=='/' ) is_absolute_path = 1; #endif + + kstring_t err = {0,0,0}; if ( !is_absolute_path ) { int i; @@ -231,16 +233,14 @@ static void *dlopen_plugin(args_t *args, const char *fname) #else handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though #endif - if ( args->verbose > 1 ) - { - if ( !handle ) + if ( !handle ) #ifdef _WIN32 - fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError()); + ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError()); #else - fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror()); + ksprintf(&err,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); #endif - else fprintf(stderr,"%s:\n\tplugin open .. ok\n", tmp); - } + else if ( args->verbose > 1 ) + fprintf(stderr,"%s:\n\tplugin open .. ok\n", tmp); free(tmp); if ( handle ) return handle; } @@ -251,33 +251,46 @@ static void *dlopen_plugin(args_t *args, const char *fname) #else handle = dlopen(fname, RTLD_NOW); #endif - if ( args->verbose > 1 ) - { - if ( !handle ) + if ( !handle ) #ifdef _WIN32 - fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError()); + ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError()); #else - fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror()); + ksprintf(&err,"%s:\n\tdlopen .. %s\n", fname,dlerror()); #endif - else fprintf(stderr,"%s:\n\tplugin open .. ok\n", fname); - } + else if ( args->verbose > 1 ) + fprintf(stderr,"%s:\n\tplugin open .. 
ok\n", fname); + + if ( !handle && (!args->plist_only || args->verbose>1) ) + fprintf(stderr,"%s",err.s); + free(err.s); return handle; } -static void print_plugin_usage_hint(void) +static void print_plugin_usage_hint(const char *name) { - fprintf(stderr, "\nNo functional bcftools plugins were found"); + if ( name ) + fprintf(stderr, "\nThe bcftools plugin \"%s\" was not found or is not functional", name); + else + fprintf(stderr, "\nNo functional bcftools plugins were found"); if ( !getenv("BCFTOOLS_PLUGINS") ) - fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n"); + { + fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set"); +#ifdef PLUGINPATH + fprintf(stderr,"\nand no usable plugins were found in %s", PLUGINPATH); +#endif + fprintf(stderr,".\n\n"); + } else + { fprintf(stderr, " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n" "- Is the plugin path correct?\n\n" - "- Run \"bcftools plugin -lv\" for more detailed error output.\n" + "- Run \"bcftools plugin -l\" or \"bcftools plugin -lvv\" for a list of available plugins.\n" "\n", getenv("BCFTOOLS_PLUGINS") ); + } } static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin) @@ -289,7 +302,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi { if ( exit_on_error ) { - print_plugin_usage_hint(); + print_plugin_usage_hint(fname); error("Could not load \"%s\".\n\n", fname); } return -1; @@ -410,12 +423,9 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi return 0; } -static void init_plugin(args_t *args) +static void check_version(args_t *args) { static int warned_bcftools = 0, warned_htslib = 0; - - int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out); - if ( ret<0 ) error("The plugin exited with an error.\n"); const char *bver, *hver; args->plugin.version(&bver, &hver); if ( strcmp(bver,bcftools_version()) && !warned_bcftools ) @@ -428,6 +438,13 @@ 
static void init_plugin(args_t *args) fprintf(stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver); warned_htslib = 1; } +} + +static void init_plugin(args_t *args) +{ + int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out); + if ( ret<0 ) error("The plugin exited with an error.\n"); + check_version(args); args->drop_header += ret; } @@ -487,7 +504,7 @@ static int list_plugins(args_t *args) if ( args->verbose ) printf("\n"); } else - print_plugin_usage_hint(); + print_plugin_usage_hint(NULL); free(str.s); return nplugins ? 0 : 1; } @@ -505,7 +522,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -592,10 +609,9 @@ int main_plugin(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->nplugin_paths = -1; - int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0; + int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0; if ( argc==1 ) usage(args); - char *plugin_name = NULL; if ( argv[1][0]!='-' ) { @@ -606,6 +622,7 @@ int main_plugin(int argc, char *argv[]) load_plugin(args, plugin_name, 1, &args->plugin); if ( args->plugin.run ) { + check_version(args); int ret = args->plugin.run(argc, argv); destroy_data(args); free(args); @@ -646,13 +663,17 @@ int main_plugin(int argc, char 
*argv[]) default: error("The output type \"%s\" not recognised\n", optarg); }; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; - case 'l': plist_only = 1; break; + case 'l': args->plist_only = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': @@ -660,8 +681,8 @@ int main_plugin(int argc, char *argv[]) default: error("Unknown argument: %s\n", optarg); } } - if ( plist_only ) return list_plugins(args); - if ( usage_only && ! plugin_name ) usage(args); + if ( args->plist_only ) return list_plugins(args); + if ( !plugin_name ) usage(args); if ( version_only ) { @@ -682,7 +703,7 @@ int main_plugin(int argc, char *argv[]) } char *fname = NULL; - if ( optind>=argc || argv[optind][0]=='-' ) + if ( optind>=argc || (argv[optind][0]=='-' && argv[optind][1]) ) { args->plugin.argc = argc - optind + 1; args->plugin.argv = argv + optind - 1; diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index 3b63c8c..2143a0a 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -2,7 +2,7 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2017 Genome Research Ltd. 
+ Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -150,7 +150,7 @@ typedef struct _args_t char **plugin_paths; char **argv, *output_fname, *regions_list, *targets_list; - int argc, drop_header, verbose, record_cmd_line; + int argc, drop_header, verbose, record_cmd_line, plist_only; } args_t; @@ -180,7 +180,7 @@ static void add_plugin_paths(args_t *args, const char *path) args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1)); args->plugin_paths[args->nplugin_paths] = dir; args->nplugin_paths++; - if ( args->verbose > 1 ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir); + if ( args->verbose > 1 && strcmp(".",dir) ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir); } else { @@ -222,6 +222,8 @@ static void *dlopen_plugin(args_t *args, const char *fname) #else if ( fname[0]=='/' ) is_absolute_path = 1; #endif + + kstring_t err = {0,0,0}; if ( !is_absolute_path ) { int i; @@ -233,16 +235,14 @@ static void *dlopen_plugin(args_t *args, const char *fname) #else handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though #endif - if ( args->verbose > 1 ) - { - if ( !handle ) + if ( !handle ) #ifdef _WIN32 - fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError()); + ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError()); #else - fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror()); + ksprintf(&err,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); #endif - else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", tmp); - } + else if ( args->verbose > 1 ) + fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", tmp); free(tmp); if ( handle ) return handle; } @@ -253,33 +253,46 @@ static void *dlopen_plugin(args_t *args, const char *fname) #else handle = dlopen(fname, RTLD_NOW); #endif - if ( args->verbose > 1 ) - { - if ( !handle ) + if ( !handle ) #ifdef _WIN32 - fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. 
%lu\n", fname, GetLastError()); + ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError()); #else - fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror()); + ksprintf(&err,"%s:\n\tdlopen .. %s\n", fname,dlerror()); #endif - else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", fname); - } + else if ( args->verbose > 1 ) + fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", fname); + + if ( !handle && (!args->plist_only || args->verbose>1) ) + fprintf(bcftools_stderr,"%s",err.s); + free(err.s); return handle; } -static void print_plugin_usage_hint(void) +static void print_plugin_usage_hint(const char *name) { - fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found"); + if ( name ) + fprintf(bcftools_stderr, "\nThe bcftools plugin \"%s\" was not found or is not functional", name); + else + fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found"); if ( !getenv("BCFTOOLS_PLUGINS") ) - fprintf(bcftools_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n"); + { + fprintf(bcftools_stderr,". 
The environment variable BCFTOOLS_PLUGINS is not set"); +#ifdef PLUGINPATH + fprintf(bcftools_stderr,"\nand no usable plugins were found in %s", PLUGINPATH); +#endif + fprintf(bcftools_stderr,".\n\n"); + } else + { fprintf(bcftools_stderr, " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n" "- Is the plugin path correct?\n\n" - "- Run \"bcftools plugin -lv\" for more detailed error output.\n" + "- Run \"bcftools plugin -l\" or \"bcftools plugin -lvv\" for a list of available plugins.\n" "\n", getenv("BCFTOOLS_PLUGINS") ); + } } static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin) @@ -291,7 +304,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi { if ( exit_on_error ) { - print_plugin_usage_hint(); + print_plugin_usage_hint(fname); error("Could not load \"%s\".\n\n", fname); } return -1; @@ -412,12 +425,9 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi return 0; } -static void init_plugin(args_t *args) +static void check_version(args_t *args) { static int warned_bcftools = 0, warned_htslib = 0; - - int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out); - if ( ret<0 ) error("The plugin exited with an error.\n"); const char *bver, *hver; args->plugin.version(&bver, &hver); if ( strcmp(bver,bcftools_version()) && !warned_bcftools ) @@ -430,6 +440,13 @@ static void init_plugin(args_t *args) fprintf(bcftools_stderr,"WARNING: htslib version mismatch .. 
bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver); warned_htslib = 1; } +} + +static void init_plugin(args_t *args) +{ + int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out); + if ( ret<0 ) error("The plugin exited with an error.\n"); + check_version(args); args->drop_header += ret; } @@ -489,7 +506,7 @@ static int list_plugins(args_t *args) if ( args->verbose ) fprintf(bcftools_stdout, "\n"); } else - print_plugin_usage_hint(); + print_plugin_usage_hint(NULL); free(str.s); return nplugins ? 0 : 1; } @@ -507,7 +524,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -560,7 +577,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); fprintf(bcftools_stderr, " -V, --version print version string and exit\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } static int is_verbose(int argc, char *argv[]) @@ -594,10 +611,9 @@ int main_plugin(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->nplugin_paths = -1; - int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0; + int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0; if ( argc==1 ) usage(args); - char *plugin_name = NULL; if ( argv[1][0]!='-' ) { @@ -608,6 +624,7 @@ 
int main_plugin(int argc, char *argv[]) load_plugin(args, plugin_name, 1, &args->plugin); if ( args->plugin.run ) { + check_version(args); int ret = args->plugin.run(argc, argv); destroy_data(args); free(args); @@ -648,13 +665,17 @@ int main_plugin(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); }; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; - case 'l': plist_only = 1; break; + case 'l': args->plist_only = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': @@ -662,8 +683,8 @@ int main_plugin(int argc, char *argv[]) default: error("Unknown argument: %s\n", optarg); } } - if ( plist_only ) return list_plugins(args); - if ( usage_only && ! 
plugin_name ) usage(args); + if ( args->plist_only ) return list_plugins(args); + if ( !plugin_name ) usage(args); if ( version_only ) { @@ -684,7 +705,7 @@ int main_plugin(int argc, char *argv[]) } char *fname = NULL; - if ( optind>=argc || argv[optind][0]=='-' ) + if ( optind>=argc || (argv[optind][0]=='-' && argv[optind][1]) ) { args->plugin.argc = argc - optind + 1; args->plugin.argv = argv + optind - 1; diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 806ecf1..6568c82 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -226,7 +226,7 @@ static void usage(void) fprintf(stderr, " -H, --print-header print header\n"); fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -l, --list-samples print the list of samples and exit\n"); - fprintf(stderr, " -o, --output-file output file name [stdout]\n"); + fprintf(stderr, " -o, --output output file name [stdout]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(stderr, " -s, --samples list of samples to include\n"); @@ -257,6 +257,7 @@ int main_vcfquery(int argc, char *argv[]) {"exclude",1,0,'e'}, {"format",1,0,'f'}, {"output-file",1,0,'o'}, + {"output",1,0,'o'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, @@ -296,8 +297,12 @@ int main_vcfquery(int argc, char *argv[]) args->format_str = str.s; break; } - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str 
= optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index 66afb08..fc264b7 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -2,7 +2,7 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -228,7 +228,7 @@ static void usage(void) fprintf(bcftools_stderr, " -H, --print-header print header\n"); fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); fprintf(bcftools_stderr, " -l, --list-samples print the list of samples and exit\n"); - fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(bcftools_stderr, " -s, --samples list of samples to include\n"); @@ -241,7 +241,7 @@ static void usage(void) fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfquery(int argc, char *argv[]) @@ -259,6 +259,7 @@ int main_vcfquery(int argc, char *argv[]) {"exclude",1,0,'e'}, {"format",1,0,'f'}, {"output-file",1,0,'o'}, + {"output",1,0,'o'}, {"regions",1,0,'r'}, 
{"regions-file",1,0,'R'}, {"targets",1,0,'t'}, @@ -298,8 +299,12 @@ int main_vcfquery(int argc, char *argv[]) args->format_str = str.s; break; } - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c index 1c822cb..8e95c9a 100644 --- a/bcftools/vcfroh.c +++ b/bcftools/vcfroh.c @@ -1,6 +1,6 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -103,7 +103,7 @@ typedef struct _args_t int ntot; // some stats to detect if things didn't go wrong int nno_af; // number of sites rejected because AF could not be determined int nfiltered; // .. 
because of filters - int nnot_biallelic, ndup; + int nno_alt, nmultiallelic, ndup; smpl_t *smpl; // HMM data for each sample smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF) smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file) @@ -111,6 +111,7 @@ typedef struct _args_t int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname; int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads; + int include_noalt_sites; BGZF *out; kstring_t str; @@ -548,6 +549,7 @@ static void flush_viterbi(args_t *args, int ismpl) { smpl->rg.state = 1; smpl->rg.beg = smpl->sites[i]; + smpl->rg.end = smpl->sites[i]; smpl->rg.rid = args->prev_rid; smpl->rg.qual = qual; smpl->rg.nqual = 1; @@ -656,8 +658,10 @@ static void flush_viterbi(args_t *args, int ismpl) } } -int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) +int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { + if ( tgt->nals < 2 ) + error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s); if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match int i; @@ -837,7 +841,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args->files->targets, line, &alt_freq); + ret = read_AF(args, args->files->targets, line, &alt_freq); } else if ( args->dflt_AF > 0 ) { @@ -997,33 +1001,32 @@ static void vcfroh(args_t *args, bcf1_t *line) // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; - if ( line->n_allele==1 ) { args->nnot_biallelic++; return; } // no ALT allele - if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; } // cannot be bi-allelic, even with <*> // This can 
be raw callable VCF with the symbolic unseen allele <*> - int ial = 0; + int ial = 0, nalt = line->n_allele - 1; for (i=1; in_allele; i++) - if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; } - if ( ial==0 ) // normal VCF, the symbolic allele is not present { - if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; } // not biallelic - ial = 1; + if ( !strcmp("<*>",line->d.allele[i]) || !strcmp("",line->d.allele[i]) ) nalt--; + else if ( !ial ) ial = i; } - else + + if ( !nalt ) // no ALT allele { - if ( line->n_allele!=3 ) return; // not biallelic - ial = ial==1 ? 2 : 1; // <*> can come in any order + args->nno_alt++; + if ( !args->include_noalt_sites ) return; + } + else if ( nalt>1 ) + { + args->nmultiallelic++; + return; } + if ( args->snps_only && !bcf_is_snp(line) ) return; // Initialize genetic map int skip_rid = 0; if ( args->prev_rid<0 ) - { - args->prev_rid = line->rid; - args->prev_pos = line->pos; skip_rid = load_genmap(args, bcf_seqname(args->hdr,line)); - } // New chromosome? if ( args->prev_rid!=line->rid ) @@ -1071,7 +1074,7 @@ static void usage(args_t *args) fprintf(stderr, "General Options:\n"); fprintf(stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); fprintf(stderr, " --AF-tag use TAG for allele frequency\n"); - fprintf(stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n"); + fprintf(stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); fprintf(stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); fprintf(stderr, " use, in MB. 
The default overlap is set to roughly 1%% of the buffer size.\n"); @@ -1082,6 +1085,7 @@ static void usage(args_t *args) fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n"); fprintf(stderr, " --include select sites for which the expression is true\n"); fprintf(stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); + fprintf(stderr, " --include-noalt include sites with no ALT allele (ignored by default)\n"); fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); fprintf(stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); fprintf(stderr, " is replaced with chromosome name\n"); @@ -1122,6 +1126,7 @@ int main_vcfroh(int argc, char *argv[]) {"AF-dflt",1,0,2}, {"include",1,0,3}, {"exclude",1,0,4}, + {"include-noalt",0,0,5}, {"buffer-size",1,0,'b'}, {"ignore-homref",0,0,'i'}, {"estimate-AF",1,0,'e'}, @@ -1154,8 +1159,13 @@ int main_vcfroh(int argc, char *argv[]) args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; - case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break; - case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break; + case 3 : + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 4 : + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 5: args->include_noalt_sites = 1; break; case 'o': args->output_fname = optarg; break; case 'O': if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; @@ -1257,7 +1267,7 @@ int main_vcfroh(int argc, char *argv[]) fprintf(stderr,"Number of lines overlapping with --AF-file/processed: 
%d/%d\n", args->ntot,nmin); else fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin); - fprintf(stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup); + fprintf(stderr,"Number of lines filtered/no AF/no alt/multiallelic/dup: %d/%d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nno_alt,args->nmultiallelic,args->ndup); if ( nmin==0 ) { fprintf(stderr,"No usable sites were found.\n"); diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index 33defa4..b742faa 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -2,7 +2,7 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -105,7 +105,7 @@ typedef struct _args_t int ntot; // some stats to detect if things didn't go wrong int nno_af; // number of sites rejected because AF could not be determined int nfiltered; // .. 
because of filters - int nnot_biallelic, ndup; + int nno_alt, nmultiallelic, ndup; smpl_t *smpl; // HMM data for each sample smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF) smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file) @@ -113,6 +113,7 @@ typedef struct _args_t int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname; int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads; + int include_noalt_sites; BGZF *out; kstring_t str; @@ -550,6 +551,7 @@ static void flush_viterbi(args_t *args, int ismpl) { smpl->rg.state = 1; smpl->rg.beg = smpl->sites[i]; + smpl->rg.end = smpl->sites[i]; smpl->rg.rid = args->prev_rid; smpl->rg.qual = qual; smpl->rg.nqual = 1; @@ -658,8 +660,10 @@ static void flush_viterbi(args_t *args, int ismpl) } } -int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) +int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { + if ( tgt->nals < 2 ) + error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s); if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match int i; @@ -769,7 +773,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. 
fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1); } #undef BRANCH } @@ -799,7 +803,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1); } #undef BRANCH } @@ -839,7 +843,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args->files->targets, line, &alt_freq); + ret = read_AF(args, args->files->targets, line, &alt_freq); } else if ( args->dflt_AF > 0 ) { @@ -941,7 +945,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. 
fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1); } #undef BRANCH } @@ -999,33 +1003,32 @@ static void vcfroh(args_t *args, bcf1_t *line) // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; - if ( line->n_allele==1 ) { args->nnot_biallelic++; return; } // no ALT allele - if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; } // cannot be bi-allelic, even with <*> // This can be raw callable VCF with the symbolic unseen allele <*> - int ial = 0; + int ial = 0, nalt = line->n_allele - 1; for (i=1; in_allele; i++) - if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; } - if ( ial==0 ) // normal VCF, the symbolic allele is not present { - if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; } // not biallelic - ial = 1; + if ( !strcmp("<*>",line->d.allele[i]) || !strcmp("",line->d.allele[i]) ) nalt--; + else if ( !ial ) ial = i; } - else + + if ( !nalt ) // no ALT allele { - if ( line->n_allele!=3 ) return; // not biallelic - ial = ial==1 ? 2 : 1; // <*> can come in any order + args->nno_alt++; + if ( !args->include_noalt_sites ) return; + } + else if ( nalt>1 ) + { + args->nmultiallelic++; + return; } + if ( args->snps_only && !bcf_is_snp(line) ) return; // Initialize genetic map int skip_rid = 0; if ( args->prev_rid<0 ) - { - args->prev_rid = line->rid; - args->prev_pos = line->pos; skip_rid = load_genmap(args, bcf_seqname(args->hdr,line)); - } // New chromosome? 
if ( args->prev_rid!=line->rid ) @@ -1073,7 +1076,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "General Options:\n"); fprintf(bcftools_stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); fprintf(bcftools_stderr, " --AF-tag use TAG for allele frequency\n"); - fprintf(bcftools_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n"); + fprintf(bcftools_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); fprintf(bcftools_stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); fprintf(bcftools_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); fprintf(bcftools_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); @@ -1084,6 +1087,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " Safe value to use is 30 to account for GT errors.\n"); fprintf(bcftools_stderr, " --include select sites for which the expression is true\n"); fprintf(bcftools_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); + fprintf(bcftools_stderr, " --include-noalt include sites with no ALT allele (ignored by default)\n"); fprintf(bcftools_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); fprintf(bcftools_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); fprintf(bcftools_stderr, " is replaced with chromosome name\n"); @@ -1103,7 +1107,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); fprintf(bcftools_stderr, " -V, --viterbi-training estimate HMM parameters, is the convergence threshold, e.g. 
1e-10 (experimental)\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfroh(int argc, char *argv[]) @@ -1124,6 +1128,7 @@ int main_vcfroh(int argc, char *argv[]) {"AF-dflt",1,0,2}, {"include",1,0,3}, {"exclude",1,0,4}, + {"include-noalt",0,0,5}, {"buffer-size",1,0,'b'}, {"ignore-homref",0,0,'i'}, {"estimate-AF",1,0,'e'}, @@ -1156,8 +1161,13 @@ int main_vcfroh(int argc, char *argv[]) args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; - case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break; - case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break; + case 3 : + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 4 : + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 5: args->include_noalt_sites = 1; break; case 'o': args->output_fname = optarg; break; case 'O': if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; @@ -1259,7 +1269,7 @@ int main_vcfroh(int argc, char *argv[]) fprintf(bcftools_stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin); else fprintf(bcftools_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin); - fprintf(bcftools_stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup); + fprintf(bcftools_stderr,"Number of lines filtered/no AF/no alt/multiallelic/dup: %d/%d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nno_alt,args->nmultiallelic,args->ndup); if ( nmin==0 ) { fprintf(bcftools_stderr,"No usable sites were found.\n"); diff --git a/bcftools/vcfsom.c 
b/bcftools/vcfsom.c index ed86422..db01d24 100644 --- a/bcftools/vcfsom.c +++ b/bcftools/vcfsom.c @@ -1,6 +1,6 @@ /* vcfsom.c -- SOM (Self-Organizing Map) filtering. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2014, 2020 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -165,15 +166,16 @@ void annots_reader_close(args_t *args) static void som_write_map(char *prefix, som_t **som, int nsom) { FILE *fp = open_file(NULL,"w","%s.som",prefix); - fwrite("SOMv1",5,1,fp); - fwrite(&nsom,sizeof(int),1,fp); + size_t nw; + if ( (nw=fwrite("SOMv1",5,1,fp))!=5 ) error("Failed to write 5 bytes\n"); + if ( (nw=fwrite(&nsom,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int)); int i; for (i=0; isize,sizeof(int),1,fp); - fwrite(&som[i]->kdim,sizeof(int),1,fp); - fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp); - fwrite(som[i]->c,sizeof(double),som[i]->size,fp); + if ( (nw=fwrite(&som[i]->size,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int)); + if ( (nw=fwrite(&som[i]->kdim,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int)); + if ( (nw=fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp))!=sizeof(double)*som[i]->size*som[i]->kdim ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size*som[i]->kdim); + if ( (nw=fwrite(som[i]->c,sizeof(double),som[i]->size,fp))!=sizeof(double)*som[i]->size ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size); } if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix); } diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c index b8368f6..effd352 100644 --- a/bcftools/vcfsom.c.pysam.c +++ b/bcftools/vcfsom.c.pysam.c @@ -2,7 +2,7 @@ /* vcfsom.c -- SOM (Self-Organizing Map) filtering. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2014, 2020 Genome Research Ltd. 
Author: Petr Danecek @@ -27,6 +27,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -167,15 +168,16 @@ void annots_reader_close(args_t *args) static void som_write_map(char *prefix, som_t **som, int nsom) { FILE *fp = open_file(NULL,"w","%s.som",prefix); - fwrite("SOMv1",5,1,fp); - fwrite(&nsom,sizeof(int),1,fp); + size_t nw; + if ( (nw=fwrite("SOMv1",5,1,fp))!=5 ) error("Failed to write 5 bytes\n"); + if ( (nw=fwrite(&nsom,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int)); int i; for (i=0; isize,sizeof(int),1,fp); - fwrite(&som[i]->kdim,sizeof(int),1,fp); - fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp); - fwrite(som[i]->c,sizeof(double),som[i]->size,fp); + if ( (nw=fwrite(&som[i]->size,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int)); + if ( (nw=fwrite(&som[i]->kdim,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int)); + if ( (nw=fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp))!=sizeof(double)*som[i]->size*som[i]->kdim ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size*som[i]->kdim); + if ( (nw=fwrite(som[i]->c,sizeof(double),som[i]->size,fp))!=sizeof(double)*som[i]->size ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size); } if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix); } @@ -638,7 +640,7 @@ static void usage(void) fprintf(bcftools_stderr, " -n, --ntrain-sites effective number of training sites [number of good sites]\n"); fprintf(bcftools_stderr, " -r, --random-seed random seed, 0 for time() [1]\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfsom(int argc, char *argv[]) diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c index 99aa598..7ec13fb 100644 --- a/bcftools/vcfsort.c +++ b/bcftools/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017 Genome Research Ltd. 
+ Copyright (C) 2017-2021 Genome Research Ltd. Author: Petr Danecek @@ -227,7 +227,7 @@ void merge_blocks(args_t *args) blk_read(args, bhp, args->hdr, blk); } - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); while ( bhp->ndat ) { @@ -252,19 +252,23 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools sort [OPTIONS] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(stderr, " -o, --output-file output file name [stdout]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX]\n"); + fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(stderr, " -o, --output FILE output file name [stdout]\n"); + fprintf(stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); +#ifdef _WIN32 + fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); +#else + fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); +#endif fprintf(stderr, "\n"); exit(1); } -size_t parse_mem_string(char *str) +size_t parse_mem_string(const char *str) { char *tmp; double mem = strtod(str, &tmp); - if ( tmp==str ) error("Could not parse: --max-mem %s\n", str); + if ( tmp==str ) error("Could not parse the memory string: \"%s\"\n", str); if ( !strcasecmp("k",tmp) ) mem *= 1000; else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000; else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000; @@ 
-274,21 +278,8 @@ size_t parse_mem_string(char *str) void mkdir_p(const char *fmt, ...); static void init(args_t *args) { -#ifdef _WIN32 - char tmp_path[MAX_PATH]; - int ret = GetTempPath(MAX_PATH, tmp_path); - if (!ret || ret > MAX_PATH) - error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools-sort.XXXXXX"); - args->tmp_dir = strdup(tmp_path); -#else - args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX"); -#endif - size_t len = strlen(args->tmp_dir); - if ( !strcmp("XXXXXX",args->tmp_dir+len-6) ) - { + args->tmp_dir = init_tmp_prefix(args->tmp_dir); + #ifdef _WIN32 int ret = mkdir(mktemp(args->tmp_dir), 0700); if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno)); @@ -298,10 +289,6 @@ static void init(args_t *args) int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR); if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno)); #endif - } - else { - mkdir_p("%s/",args->tmp_dir); - } fprintf(stderr,"Writing to %s\n", args->tmp_dir); } @@ -326,6 +313,7 @@ int main_sort(int argc, char *argv[]) {"temp-dir",required_argument,NULL,'T'}, {"output-type",required_argument,NULL,'O'}, {"output-file",required_argument,NULL,'o'}, + {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, {0,0,0,0} }; diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c index 542fc28..1fd74d3 100644 --- a/bcftools/vcfsort.c.pysam.c +++ b/bcftools/vcfsort.c.pysam.c @@ -2,7 +2,7 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. Author: Petr Danecek @@ -90,7 +90,7 @@ void clean_files_and_throw(args_t *args, const char *format, ...) 
vfprintf(bcftools_stderr, format, ap); va_end(ap); clean_files(args); - exit(-1); + bcftools_exit(-1); } int cmp_bcf_pos(const void *aptr, const void *bptr) @@ -229,7 +229,7 @@ void merge_blocks(args_t *args) blk_read(args, bhp, args->hdr, blk); } - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); while ( bhp->ndat ) { @@ -254,19 +254,23 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools sort [OPTIONS] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX]\n"); + fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); +#ifdef _WIN32 + fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); +#else + fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); +#endif fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } -size_t parse_mem_string(char *str) +size_t parse_mem_string(const char *str) { char *tmp; double mem = strtod(str, &tmp); - if ( tmp==str ) error("Could not parse: 
--max-mem %s\n", str); + if ( tmp==str ) error("Could not parse the memory string: \"%s\"\n", str); if ( !strcasecmp("k",tmp) ) mem *= 1000; else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000; else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000; @@ -276,21 +280,8 @@ size_t parse_mem_string(char *str) void mkdir_p(const char *fmt, ...); static void init(args_t *args) { -#ifdef _WIN32 - char tmp_path[MAX_PATH]; - int ret = GetTempPath(MAX_PATH, tmp_path); - if (!ret || ret > MAX_PATH) - error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools-sort.XXXXXX"); - args->tmp_dir = strdup(tmp_path); -#else - args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX"); -#endif - size_t len = strlen(args->tmp_dir); - if ( !strcmp("XXXXXX",args->tmp_dir+len-6) ) - { + args->tmp_dir = init_tmp_prefix(args->tmp_dir); + #ifdef _WIN32 int ret = mkdir(mktemp(args->tmp_dir), 0700); if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno)); @@ -300,10 +291,6 @@ static void init(args_t *args) int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR); if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno)); #endif - } - else { - mkdir_p("%s/",args->tmp_dir); - } fprintf(bcftools_stderr,"Writing to %s\n", args->tmp_dir); } @@ -328,6 +315,7 @@ int main_sort(int argc, char *argv[]) {"temp-dir",required_argument,NULL,'T'}, {"output-type",required_argument,NULL,'O'}, {"output-file",required_argument,NULL,'o'}, + {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, {0,0,0,0} }; diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index ffa367d..601c557 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. 
- Copyright (C) 2012-2017 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -31,6 +31,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "bin.h" +#include "dist.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 @@ -57,7 +59,7 @@ typedef struct float min, max; uint64_t *vals_ts, *vals_tv; void *val; - int nbins, type, m_val; + int nbins, type, m_val, idx; } user_stats_t; @@ -81,7 +83,9 @@ typedef struct #endif int ts_alt1, tv_alt1; #if QUAL_STATS - int *qual_ts, *qual_tv, *qual_snps, *qual_indels; + // Values are rounded to one significant digit and 1 is added (Q*10+1); missing and negative values go in the first bin + // Only SNPs that are the 1st alternate allele are counted + dist_t *qual_ts, *qual_tv, *qual_indels; #endif int *insertions, *deletions, m_indel; // maximum indel length int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; @@ -186,13 +190,6 @@ static inline int idist_i2bin(idist_t *d, int i) return i-1+d->min; } -static inline int clip_nonnegative(float x, int limit) -{ - if (x >= limit || isnan(x)) return limit - 1; - else if (x <= 0.0) return 0; - else return (int) x; -} - #define IC_DBG 0 #if IC_DBG static void _indel_ctx_print1(_idc1_t *idc) @@ -349,12 +346,29 @@ static void add_user_stats(args_t *args, char *str) args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr); user_stats_t *usr = &args->usr[args->nusr-1]; memset(usr,0,sizeof(*usr)); - usr->min = 0; - usr->max = 1; + usr->min = 0; + usr->max = 1; usr->nbins = 100; + usr->idx = 0; char *tmp = str; while ( *tmp && *tmp!=':' ) tmp++; + + // Tag with an index or just tag? (e.g. 
PV4[1] vs DP) + if ( tmp > str && tmp[-1]==']' ) + { + char *ptr = tmp; + while ( ptr>str && *ptr!='[' ) ptr--; + if ( *ptr=='[' ) + { + char *ptr2; + usr->idx = strtol(ptr+1, &ptr2, 10); + if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx); + if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str); + *ptr = 0; + } + } + usr->tag = (char*)calloc(tmp-str+2,sizeof(char)); memcpy(usr->tag,str,tmp-str); @@ -465,10 +479,9 @@ static void init_stats(args_t *args) int j; for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int)); #if QUAL_STATS - stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int)); + stats->qual_ts = dist_init(5); + stats->qual_tv = dist_init(5); + stats->qual_indels = dist_init(5); #endif if ( args->files->n_smpl ) { @@ -548,10 +561,9 @@ static void destroy_stats(args_t *args) for (j=0; j<3; j++) if (stats->af_repeats[j]) free(stats->af_repeats[j]); #if QUAL_STATS - if (stats->qual_ts) free(stats->qual_ts); - if (stats->qual_tv) free(stats->qual_tv); - if (stats->qual_snps) free(stats->qual_snps); - if (stats->qual_indels) free(stats->qual_indels); + if (stats->qual_ts) dist_destroy(stats->qual_ts); + if (stats->qual_tv) dist_destroy(stats->qual_tv); + if (stats->qual_indels) dist_destroy(stats->qual_indels); #endif #if HWE_STATS free(stats->af_hwe); @@ -678,8 +690,8 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) bcf1_t *line = reader->buffer[0]; #if QUAL_STATS - int iqual = clip_nonnegative(line->qual, args->m_qual); - stats->qual_indels[iqual]++; + int iqual = (isnan(line->qual) || line->qual<0) ? 
0 : 1 + (int)(line->qual*10); + dist_insert(stats->qual_indels, iqual); #endif // Check if the indel is near an exon for the frameshift statistics @@ -780,7 +792,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) { - int i; + int i, nval; for (i=0; inusr; i++) { user_stats_t *usr = &stats->usr[i]; @@ -788,13 +800,15 @@ static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) float val; if ( usr->type==BCF_HT_REAL ) { - if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue; - val = ((float*)usr->val)[0]; + if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; + if ( usr->idx >= nval ) continue; + val = ((float*)usr->val)[usr->idx]; } else { - if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue; - val = ((int32_t*)usr->val)[0]; + if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; + if ( usr->idx >= nval ) continue; + val = ((int32_t*)usr->val)[usr->idx]; } int idx; if ( val<=usr->min ) idx = 0; @@ -813,8 +827,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) if ( ref<0 ) return; #if QUAL_STATS - int iqual = clip_nonnegative(line->qual, args->m_qual); - stats->qual_snps[iqual]++; + int iqual = (isnan(line->qual) || line->qual<0) ? 
0 : 1 + (int)(line->qual*10); #endif int i; @@ -833,7 +846,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->ts_alt1++; #if QUAL_STATS - stats->qual_ts[iqual]++; + dist_insert(stats->qual_ts,iqual); #endif do_user_stats(stats, reader, 1); } @@ -845,7 +858,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->tv_alt1++; #if QUAL_STATS - stats->qual_tv[iqual]++; + dist_insert(stats->qual_tv,iqual); #endif do_user_stats(stats, reader, 0); } @@ -1354,21 +1367,50 @@ static void print_stats(args_t *args) } } #if QUAL_STATS - printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); + printf("# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - for (i=0; im_qual; i++) + int ndist_ts = dist_nbins(stats->qual_ts); + int ndist_tv = dist_nbins(stats->qual_tv); + int ndist_in = dist_nbins(stats->qual_indels); + int ndist_max = ndist_ts; + if ( ndist_max < ndist_tv ) ndist_max = ndist_tv; + if ( ndist_max < ndist_in ) ndist_max = ndist_in; + uint32_t beg, end; + uint32_t nts, ntv, nin; + for (i=0; iqual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue; - printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); + nts = ntv = nin = 0; + float qval = -1; + if ( i < ndist_ts ) + { + nts = dist_get(stats->qual_ts, i, &beg, &end); + qval = beg>0 ? 0.1*(beg - 1) : -1; + } + if ( i < ndist_tv ) + { + ntv = dist_get(stats->qual_tv, i, &beg, &end); + if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1; + } + if ( i < ndist_in ) + { + nin = dist_get(stats->qual_indels, i, &beg, &end); + if ( qval==-1 ) qval = beg > 0 ? 
0.1*(beg - 1) : -1; + } + if ( nts+ntv+nin==0 ) continue; + + printf("QUAL\t%d\t",id); + if ( qval==-1 ) printf("."); + else printf("%.1f",qval); + printf("\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin); } } #endif for (i=0; inusr; i++) { - printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", - args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag); + printf("# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", + args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx); for (id=0; idnstats; id++) { user_stats_t *usr = &args->stats[id].usr[i]; @@ -1377,8 +1419,8 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n"; - printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); + const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n"; + printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } } @@ -1482,10 +1524,10 @@ static void print_stats(args_t *args) printf("# NRD and discordance is calculated as follows:\n"); printf("# m .. number of matches\n"); printf("# x .. 
number of mismatches\n"); - printf("# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); - printf("# RR discordance = xRR / (xRR + mRR)\n"); - printf("# RA discordance = xRA / (xRA + mRA)\n"); - printf("# AA discordance = xAA / (xAA + mAA)\n"); + printf("# NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); + printf("# RR discordance = 100 * xRR / (xRR + mRR)\n"); + printf("# RA discordance = 100 * xRA / (xRA + mRA)\n"); + printf("# AA discordance = 100 * xAA / (xAA + mAA)\n"); printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); } else @@ -1704,26 +1746,27 @@ static void usage(void) fprintf(stderr, "Usage: bcftools stats [options] []\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); - fprintf(stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); - fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); - fprintf(stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); - fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); - fprintf(stderr, " -S, --samples-file file of samples to include\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); + fprintf(stderr, " --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); + fprintf(stderr, " --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT\n"); + fprintf(stderr, " -1, --1st-allele-only Include only 1st allele at multiallelic sites\n"); + fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(stderr, " -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1]\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n"); + fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed 
FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(stderr, " -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n"); + fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(stderr, " A subfield can be selected as e.g. 
'PV4[0]', here the first value of the PV4 tag\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -v, --verbose Produce verbose per-site and per-sample output\n"); fprintf(stderr, "\n"); exit(1); } @@ -1795,8 +1838,12 @@ int main_vcfstats(int argc, char *argv[]) case 's': args->samples_list = optarg; break; case 'S': args->samples_list = optarg; args->samples_is_file = 1; break; case 'I': args->split_by_id = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); break; diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index c52d016..050a68a 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -2,7 +2,7 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2017 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -33,6 +33,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "bin.h" +#include "dist.h" // Logic of the filters: include or exclude sites which match the filters? 
#define FLT_INCLUDE 1 @@ -59,7 +61,7 @@ typedef struct float min, max; uint64_t *vals_ts, *vals_tv; void *val; - int nbins, type, m_val; + int nbins, type, m_val, idx; } user_stats_t; @@ -83,7 +85,9 @@ typedef struct #endif int ts_alt1, tv_alt1; #if QUAL_STATS - int *qual_ts, *qual_tv, *qual_snps, *qual_indels; + // Values are rounded to one significant digit and 1 is added (Q*10+1); missing and negative values go in the first bin + // Only SNPs that are the 1st alternate allele are counted + dist_t *qual_ts, *qual_tv, *qual_indels; #endif int *insertions, *deletions, m_indel; // maximum indel length int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; @@ -188,13 +192,6 @@ static inline int idist_i2bin(idist_t *d, int i) return i-1+d->min; } -static inline int clip_nonnegative(float x, int limit) -{ - if (x >= limit || isnan(x)) return limit - 1; - else if (x <= 0.0) return 0; - else return (int) x; -} - #define IC_DBG 0 #if IC_DBG static void _indel_ctx_print1(_idc1_t *idc) @@ -351,12 +348,29 @@ static void add_user_stats(args_t *args, char *str) args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr); user_stats_t *usr = &args->usr[args->nusr-1]; memset(usr,0,sizeof(*usr)); - usr->min = 0; - usr->max = 1; + usr->min = 0; + usr->max = 1; usr->nbins = 100; + usr->idx = 0; char *tmp = str; while ( *tmp && *tmp!=':' ) tmp++; + + // Tag with an index or just tag? (e.g. 
PV4[1] vs DP) + if ( tmp > str && tmp[-1]==']' ) + { + char *ptr = tmp; + while ( ptr>str && *ptr!='[' ) ptr--; + if ( *ptr=='[' ) + { + char *ptr2; + usr->idx = strtol(ptr+1, &ptr2, 10); + if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx); + if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str); + *ptr = 0; + } + } + usr->tag = (char*)calloc(tmp-str+2,sizeof(char)); memcpy(usr->tag,str,tmp-str); @@ -467,10 +481,9 @@ static void init_stats(args_t *args) int j; for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int)); #if QUAL_STATS - stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int)); + stats->qual_ts = dist_init(5); + stats->qual_tv = dist_init(5); + stats->qual_indels = dist_init(5); #endif if ( args->files->n_smpl ) { @@ -550,10 +563,9 @@ static void destroy_stats(args_t *args) for (j=0; j<3; j++) if (stats->af_repeats[j]) free(stats->af_repeats[j]); #if QUAL_STATS - if (stats->qual_ts) free(stats->qual_ts); - if (stats->qual_tv) free(stats->qual_tv); - if (stats->qual_snps) free(stats->qual_snps); - if (stats->qual_indels) free(stats->qual_indels); + if (stats->qual_ts) dist_destroy(stats->qual_ts); + if (stats->qual_tv) dist_destroy(stats->qual_tv); + if (stats->qual_indels) dist_destroy(stats->qual_indels); #endif #if HWE_STATS free(stats->af_hwe); @@ -680,8 +692,8 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) bcf1_t *line = reader->buffer[0]; #if QUAL_STATS - int iqual = clip_nonnegative(line->qual, args->m_qual); - stats->qual_indels[iqual]++; + int iqual = (isnan(line->qual) || line->qual<0) ? 
0 : 1 + (int)(line->qual*10); + dist_insert(stats->qual_indels, iqual); #endif // Check if the indel is near an exon for the frameshift statistics @@ -782,7 +794,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) { - int i; + int i, nval; for (i=0; inusr; i++) { user_stats_t *usr = &stats->usr[i]; @@ -790,13 +802,15 @@ static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) float val; if ( usr->type==BCF_HT_REAL ) { - if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue; - val = ((float*)usr->val)[0]; + if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; + if ( usr->idx >= nval ) continue; + val = ((float*)usr->val)[usr->idx]; } else { - if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue; - val = ((int32_t*)usr->val)[0]; + if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; + if ( usr->idx >= nval ) continue; + val = ((int32_t*)usr->val)[usr->idx]; } int idx; if ( val<=usr->min ) idx = 0; @@ -815,8 +829,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) if ( ref<0 ) return; #if QUAL_STATS - int iqual = clip_nonnegative(line->qual, args->m_qual); - stats->qual_snps[iqual]++; + int iqual = (isnan(line->qual) || line->qual<0) ? 
0 : 1 + (int)(line->qual*10); #endif int i; @@ -835,7 +848,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->ts_alt1++; #if QUAL_STATS - stats->qual_ts[iqual]++; + dist_insert(stats->qual_ts,iqual); #endif do_user_stats(stats, reader, 1); } @@ -847,7 +860,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->tv_alt1++; #if QUAL_STATS - stats->qual_tv[iqual]++; + dist_insert(stats->qual_tv,iqual); #endif do_user_stats(stats, reader, 0); } @@ -872,7 +885,7 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); bcftools_exit(1); break; } #undef BRANCH_INT @@ -1020,7 +1033,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break; } #undef BRANCH_INT } @@ -1051,7 +1064,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, 
bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break; } #undef BRANCH_INT } @@ -1356,21 +1369,50 @@ static void print_stats(args_t *args) } } #if QUAL_STATS - fprintf(bcftools_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); + fprintf(bcftools_stdout, "# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - for (i=0; im_qual; i++) + int ndist_ts = dist_nbins(stats->qual_ts); + int ndist_tv = dist_nbins(stats->qual_tv); + int ndist_in = dist_nbins(stats->qual_indels); + int ndist_max = ndist_ts; + if ( ndist_max < ndist_tv ) ndist_max = ndist_tv; + if ( ndist_max < ndist_in ) ndist_max = ndist_in; + uint32_t beg, end; + uint32_t nts, ntv, nin; + for (i=0; iqual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue; - fprintf(bcftools_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); + nts = ntv = nin = 0; + float qval = -1; + if ( i < ndist_ts ) + { + nts = dist_get(stats->qual_ts, i, &beg, &end); + qval = beg>0 ? 0.1*(beg - 1) : -1; + } + if ( i < ndist_tv ) + { + ntv = dist_get(stats->qual_tv, i, &beg, &end); + if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1; + } + if ( i < ndist_in ) + { + nin = dist_get(stats->qual_indels, i, &beg, &end); + if ( qval==-1 ) qval = beg > 0 ? 
0.1*(beg - 1) : -1; + } + if ( nts+ntv+nin==0 ) continue; + + fprintf(bcftools_stdout, "QUAL\t%d\t",id); + if ( qval==-1 ) fprintf(bcftools_stdout, "."); + else fprintf(bcftools_stdout, "%.1f",qval); + fprintf(bcftools_stdout, "\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin); } } #endif for (i=0; inusr; i++) { - fprintf(bcftools_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", - args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag); + fprintf(bcftools_stdout, "# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", + args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx); for (id=0; idnstats; id++) { user_stats_t *usr = &args->stats[id].usr[i]; @@ -1379,8 +1421,8 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n"; - fprintf(bcftools_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); + const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n"; + fprintf(bcftools_stdout, fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } } @@ -1484,10 +1526,10 @@ static void print_stats(args_t *args) fprintf(bcftools_stdout, "# NRD and discordance is calculated as follows:\n"); fprintf(bcftools_stdout, "# m .. number of matches\n"); fprintf(bcftools_stdout, "# x .. 
number of mismatches\n"); - fprintf(bcftools_stdout, "# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); - fprintf(bcftools_stdout, "# RR discordance = xRR / (xRR + mRR)\n"); - fprintf(bcftools_stdout, "# RA discordance = xRA / (xRA + mRA)\n"); - fprintf(bcftools_stdout, "# AA discordance = xAA / (xAA + mAA)\n"); + fprintf(bcftools_stdout, "# NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); + fprintf(bcftools_stdout, "# RR discordance = 100 * xRR / (xRR + mRR)\n"); + fprintf(bcftools_stdout, "# RA discordance = 100 * xRA / (xRA + mRA)\n"); + fprintf(bcftools_stdout, "# AA discordance = 100 * xAA / (xAA + mAA)\n"); fprintf(bcftools_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); } else @@ -1706,28 +1748,29 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools stats [options] []\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); - fprintf(bcftools_stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); - fprintf(bcftools_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); - fprintf(bcftools_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(bcftools_stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); - fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(bcftools_stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); - fprintf(bcftools_stderr, " -S, --samples-file file of samples to include\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); + fprintf(bcftools_stderr, " --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); + fprintf(bcftools_stderr, " --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT\n"); + fprintf(bcftools_stderr, " -1, --1st-allele-only Include only 1st allele at multiallelic sites\n"); + fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(bcftools_stderr, " -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1]\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -E, --exons FILE.gz Tab-delimited file with 
exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n"); + fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(bcftools_stderr, " -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(bcftools_stderr, " A subfield can be selected as e.g. 
'PV4[0]', here the first value of the PV4 tag\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbose Produce verbose per-site and per-sample output\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfstats(int argc, char *argv[]) @@ -1797,8 +1840,12 @@ int main_vcfstats(int argc, char *argv[]) case 's': args->samples_list = optarg; break; case 'S': args->samples_list = optarg; args->samples_is_file = 1; break; case 'I': args->split_by_id = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); break; diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index 4117d10..ce4c810 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Shane McCarthy @@ -221,12 +221,10 @@ static void init_data(args_t *args) } // setup output + const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out); char modew[8]; - strcpy(modew, "w"); + strcpy(modew,tmp); if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); - if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF - else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF - else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( args->n_threads > 0) @@ -501,7 +499,7 @@ static void usage(args_t *args) fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output-file output file name [stdout]\n"); + fprintf(stderr, " -o, --output output file name [stdout]\n"); fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); @@ -575,6 +573,7 @@ int main_vcfview(int argc, char *argv[]) {"force-samples",no_argument,NULL,1}, {"output-type",required_argument,NULL,'O'}, {"output-file",required_argument,NULL,'o'}, + {"output",required_argument,NULL,'o'}, {"types",required_argument,NULL,'v'}, {"exclude-types",required_argument,NULL,'V'}, {"targets",required_argument,NULL,'t'}, @@ -639,9 +638,12 @@ int main_vcfview(int argc, char *argv[]) break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types 
= optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 77643b7..75b3e64 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -2,7 +2,7 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Shane McCarthy @@ -166,7 +166,7 @@ static void init_data(args_t *args) if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { fprintf(bcftools_stderr, "Error: only supply one of --include-types, --exclude-types options\n"); - exit(1); + bcftools_exit(1); } char **type_list = 0; int m = 0, n = 0; @@ -197,7 +197,7 @@ static void init_data(args_t *args) else { fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n"); - exit(1); + bcftools_exit(1); } } } @@ -213,7 +213,7 @@ static void init_data(args_t *args) else { fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n"); - exit(1); + bcftools_exit(1); } } } @@ -223,12 +223,10 @@ static void init_data(args_t *args) } // setup output + const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out); char modew[8]; - strcpy(modew, "w"); + 
strcpy(modew,tmp); if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); - if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF - else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF - else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( args->n_threads > 0) @@ -302,7 +300,7 @@ int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break; } #undef BRANCH_INT if (!sample_phased) { @@ -503,7 +501,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); fprintf(bcftools_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); @@ -533,7 +531,7 @@ static void usage(args_t *args) 
fprintf(bcftools_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(bcftools_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); fprintf(bcftools_stderr, "\n"); - exit(1); + bcftools_exit(1); } int main_vcfview(int argc, char *argv[]) @@ -577,6 +575,7 @@ int main_vcfview(int argc, char *argv[]) {"force-samples",no_argument,NULL,1}, {"output-type",required_argument,NULL,'O'}, {"output-file",required_argument,NULL,'o'}, + {"output",required_argument,NULL,'o'}, {"types",required_argument,NULL,'v'}, {"exclude-types",required_argument,NULL,'V'}, {"targets",required_argument,NULL,'t'}, @@ -641,9 +640,12 @@ int main_vcfview(int argc, char *argv[]) break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; diff --git a/bcftools/vcmp.c b/bcftools/vcmp.c index 7d3b0f9..dbdc4b7 100644 --- a/bcftools/vcmp.c +++ b/bcftools/vcmp.c @@ -1,6 +1,6 @@ /* vcmp.c -- reference allele utility functions. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2015, 2018 Genome Research Ltd. 
Author: Petr Danecek diff --git a/bcftools/vcmp.c.pysam.c b/bcftools/vcmp.c.pysam.c index 00435bd..18a6813 100644 --- a/bcftools/vcmp.c.pysam.c +++ b/bcftools/vcmp.c.pysam.c @@ -2,7 +2,7 @@ /* vcmp.c -- reference allele utility functions. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2015, 2018 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/vcmp.h b/bcftools/vcmp.h index 9c6370c..03234b4 100644 --- a/bcftools/vcmp.h +++ b/bcftools/vcmp.h @@ -1,6 +1,6 @@ /* vcmp.h -- reference allele utility functions. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/version.c b/bcftools/version.c index 19cec91..d068897 100644 --- a/bcftools/version.c +++ b/bcftools/version.c @@ -1,6 +1,6 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include "bcftools.h" @@ -60,7 +61,6 @@ void error_errno(const char *format, ...) 
exit(-1); } - const char *hts_bcf_wmode(int file_type) { if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF @@ -69,4 +69,14 @@ const char *hts_bcf_wmode(int file_type) return "w"; // uncompressed VCF } +const char *hts_bcf_wmode2(int file_type, char *fname) +{ + if ( !fname ) return hts_bcf_wmode(file_type); + int len = strlen(fname); + if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + return hts_bcf_wmode(file_type); +} diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index 01dad07..37fa828 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -2,7 +2,7 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include "bcftools.h" @@ -44,7 +45,7 @@ void error(const char *format, ...) va_start(ap, format); vfprintf(bcftools_stderr, format, ap); va_end(ap); - exit(-1); + bcftools_exit(-1); } void error_errno(const char *format, ...) @@ -59,10 +60,9 @@ void error_errno(const char *format, ...) 
} else { fprintf(bcftools_stderr, "\n"); } - exit(-1); + bcftools_exit(-1); } - const char *hts_bcf_wmode(int file_type) { if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF @@ -71,4 +71,14 @@ const char *hts_bcf_wmode(int file_type) return "w"; // uncompressed VCF } +const char *hts_bcf_wmode2(int file_type, char *fname) +{ + if ( !fname ) return hts_bcf_wmode(file_type); + int len = strlen(fname); + if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + return hts_bcf_wmode(file_type); +} diff --git a/bcftools/version.sh b/bcftools/version.sh index 7232440..52b1e08 100755 --- a/bcftools/version.sh +++ b/bcftools/version.sh @@ -1,7 +1,30 @@ #!/bin/sh +# version.sh +# +# Author : Petr Danecek +# +# Copyright (C) 2018-2021 Genome Research Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.10.2 +VERSION=1.13 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/cy_build.py b/cy_build.py index fae7055..aff41a0 100644 --- a/cy_build.py +++ b/cy_build.py @@ -31,6 +31,7 @@ def is_pip_install(): class CyExtension(Extension): def __init__(self, *args, **kwargs): self._init_func = kwargs.pop("init_func", None) + self._prebuild_func = kwargs.pop("prebuild_func", None) Extension.__init__(self, *args, **kwargs) def extend_includes(self, includes): @@ -82,5 +83,8 @@ class cy_build_ext(build_ext): ext.extra_link_args = [] ext.extra_link_args += ['-Wl,-rpath,$ORIGIN'] - + + if isinstance(ext, CyExtension) and ext._prebuild_func: + ext._prebuild_func(ext, self.force) + build_ext.build_extension(self, ext) diff --git a/devtools/import.py b/devtools/import.py index f54138b..ea35792 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -40,7 +40,7 @@ EXCLUDE = { "htslib": ( 'htslib/tabix.c', 'htslib/bgzip.c', 'htslib/htsfile.c', - "test"), + "test", "tests"), } @@ -93,6 +93,10 @@ def _update_pysam_files(cf, destdir): else: lines = re.sub(r"int main\(", "int {}_{}_main(".format( basename, subname), lines) + if basename == "samtools": + lines = re.sub(r"main_(reheader)\(", + r"samtools_main_\1(", lines) + lines = re.sub(r"\bexit\(", "{}_exit(".format(basename), lines) lines = re.sub("stderr", "{}_stderr".format(basename), lines) lines = re.sub("stdout", "{}_stdout".format(basename), lines) lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines) @@ -103,9 +107,6 @@ def _update_pysam_files(cf, destdir): fn = os.path.basename(filename) # some specific fixes: 
SPECIFIC_SUBSTITUTIONS = { - "bamtk.c": ( - 'else if (strcmp(argv[1], "tview") == 0)', - '//else if (strcmp(argv[1], "tview") == 0)'), "bam_md.c": ( 'sam_open_format("-", mode_w', 'sam_open_format({}_stdout_fn, mode_w'.format(basename)), @@ -120,6 +121,10 @@ def _update_pysam_files(cf, destdir): lines = lines.replace( SPECIFIC_SUBSTITUTIONS[fn][0], SPECIFIC_SUBSTITUTIONS[fn][1]) + if fn == "bamtk.c": + lines = re.sub(r'(#include "version.h")', r'\1\n#include "samtools_config_vars.h"', lines) + lines = re.sub(r'(else if.*"tview")', r'//\1', lines) + outfile.write(lines) with open(os.path.join("import", "pysam.h")) as inf, \ @@ -224,9 +229,25 @@ if len(sys.argv) >= 1: outf.write(line) os.rename(tmpfilename, filename) + def _update_version_doc_file(dest, value, filename): + tmpfilename = filename + ".tmp" + with open(filename, encoding="utf-8") as inf: + with open(tmpfilename, "w", encoding="utf-8") as outf: + for line in inf: + if " wraps " in line: + # hide the sentence's fullstop from the main regexp + line = re.sub(r'\.$', ',DOT', line) + line = re.sub(r'{}-[^*,]*'.format(dest), + '{}-{}'.format(dest, value), line) + line = re.sub(',DOT', '.', line) + outf.write(line) + os.rename(tmpfilename, filename) + version = _getVersion(srcdir) _update_version_file("__{}_version__".format(dest), version, "pysam/version.py") _update_version_file(C_VERSION[dest], version + " (pysam)", "pysam/version.h") + _update_version_doc_file(dest, version, "README.rst") + _update_version_doc_file(dest, version, "doc/index.rst") sys.exit(0) diff --git a/devtools/install-CGAT-tools.sh b/devtools/install-CGAT-tools.sh index 27eb481..e45d391 100755 --- a/devtools/install-CGAT-tools.sh +++ b/devtools/install-CGAT-tools.sh @@ -80,7 +80,7 @@ else fi # if-OS } # install_os_packages -# funcion to install Python dependencies +# function to install Python dependencies install_python_deps() { if [ "$OS" == "ubuntu" -o "$OS" == "sl" ] ; then @@ -185,12 +185,13 @@ python setup.py install # problems in 
the compilation test. cd tests -# create auxilliary data +# create auxiliary data echo echo 'building test data' echo make -C pysam_data all make -C cbcf_data all +make -C tabix_data all # run nosetests # -s: do not capture stdout, conflicts with pysam.dispatch diff --git a/devtools/run_tests_travis.sh b/devtools/run_tests_travis.sh index 9ad41a7..1f14fc3 100755 --- a/devtools/run_tests_travis.sh +++ b/devtools/run_tests_travis.sh @@ -37,8 +37,8 @@ conda config --add channels conda-forge # pin versions, so that tests do not fail when pysam/htslib out of step # add htslib dependencies -# NB: we force conda-forge:ncurses due to bioconda/bioconda-recipes#13488 -conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge:ncurses +# NB: force conda-forge:blas due to conda/conda#7548 +conda install -y "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" xz curl bzip2 "conda-forge::blas=*=openblas" # As HTSLIB_MODE is (defaulted to) 'shared', ensure we don't pick up # the external headers from the Conda-installed htslib package. 
@@ -60,12 +60,13 @@ echo "============ installing via setup.py from repository ============" echo python setup.py install || exit -# create auxilliary data +# create auxiliary data echo echo 'building test data' echo make -C tests/pysam_data make -C tests/cbcf_data +make -C tests/tabix_data # echo any limits that are in place ulimit -a diff --git a/doc/api.rst b/doc/api.rst index 3f2c042..6246c35 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1,7 +1,4 @@ -====================================================== -pysam - An interface for reading and writing SAM files -====================================================== - +============ Introduction ============ @@ -24,7 +21,7 @@ iteration returns a :class:`~pysam.AlignedSegment` object which represents a single read along with its fields and optional tags:: for read in samfile.fetch('chr1', 100, 120): - print read + print read samfile.close() @@ -41,8 +38,8 @@ You can also write to a :class:`~pysam.AlignmentFile`:: samfile = pysam.AlignmentFile("ex1.bam", "rb") pairedreads = pysam.AlignmentFile("allpaired.bam", "wb", template=samfile) for read in samfile.fetch(): - if read.is_paired: - pairedreads.write(read) + if read.is_paired: + pairedreads.write(read) pairedreads.close() samfile.close() @@ -130,11 +127,12 @@ More detailed usage instructions is at :ref:`usage`. The pysam website containing documentation +=== API === SAM/BAM/CRAM files -------------------- +================== Objects of type :class:`~pysam.AlignmentFile` allow working with BAM/SAM formatted files. @@ -162,7 +160,7 @@ a SAM/BAM file. Tabix files ------------ +=========== :class:`~pysam.TabixFile` opens tabular files that have been indexed with tabix_. @@ -191,14 +189,14 @@ To iterate over tabix files, use :func:`~pysam.tabix_iterator`: :members: -Fasta files ------------ +FASTA files +=========== .. autoclass:: pysam.FastaFile :members: -Fastq files ------------ +FASTQ files +=========== .. 
autoclass:: pysam.FastxFile :members: @@ -208,8 +206,8 @@ Fastq files :members: -VCF files ---------- +VCF/BCF files +============= .. autoclass:: pysam.VariantFile :members: @@ -224,7 +222,7 @@ VCF files :members: HTSFile -------- +======= HTSFile is the base class for :class:`pysam.AlignmentFile` and :class:`pysam.VariantFile`. diff --git a/doc/benchmarking.rst b/doc/benchmarking.rst index 1ec0d43..8fc054a 100644 --- a/doc/benchmarking.rst +++ b/doc/benchmarking.rst @@ -1,3 +1,5 @@ +.. _Benchmarking: + ============ Benchmarking ============ diff --git a/doc/conf.py b/doc/conf.py index 375aa55..39b6f45 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -11,13 +11,13 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os, glob +import sys, os, sysconfig # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -_libdir = "../build/lib.%s-%s-%s.%s" % (os.uname()[0].lower(), os.uname()[4], - sys.version_info[0], sys.version_info[1]) +_pyversion = sysconfig.get_python_version() +_libdir = "../build/lib.%s-%s" % (sysconfig.get_platform(), _pyversion) if os.path.exists(_libdir): sys.path.insert(0, os.path.abspath(_libdir)) @@ -32,7 +32,7 @@ extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon'] -intersphinx_mapping = {'python': ('http://docs.python.org/3.5', None)} +intersphinx_mapping = {'python': ('https://docs.python.org/%s' % _pyversion, None)} # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -48,7 +48,7 @@ master_doc = 'index' # General information about the project. project = u'pysam' -copyright = u'2009, Andreas Heger, Kevin Jacobs et al.' 
+copyright = u'2009–2021, Andreas Heger, Kevin Jacobs, et al' # Included at the end of each rst file rst_epilog = ''' @@ -61,6 +61,8 @@ rst_epilog = ''' .. _Galaxy: https://main.g2.bx.psu.edu/ .. _cython: http://cython.org/ .. _python: http://python.org/ +.. _pypi: https://pypi.org/ +.. _pip: https://pip.pypa.io/ .. _pyximport: http://www.prescod.net/pyximport/ .. _conda: https://conda.io/docs/ .. _bioconda: https://bioconda.github.io/ @@ -201,8 +203,8 @@ htmlhelp_basename = 'samtoolsdoc' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'pysam.tex', ur'pysam documentation', - ur'Andreas Heger, Kevin Jacobs et al.', 'manual'), + ('index', 'pysam.tex', u'pysam documentation', + u'Andreas Heger, Kevin Jacobs, et al.', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/doc/developer.rst b/doc/developer.rst index 09ae832..ca49fdc 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -12,7 +12,7 @@ directories: Code specific to pysam :file:`doc` - The documentation. To build the latest documention type:: + The documentation. To build the latest documentation type:: make -C doc html @@ -46,6 +46,17 @@ run:: pytest tests +Most tests use test data from the :file:`tests/*_data` directories. +Some of these test data files are generated from other files in these +directories, which is done by running ``make`` in each directory:: + + make -C tests/pysam_data + # etc + +Alternatively if any :file:`tests/*_data/all.stamp` file is not already +present, running the unit tests should generate that directory's data +files automatically. + Benchmarking ============ diff --git a/doc/faq.rst b/doc/faq.rst index 62fe11d..fc39b60 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -5,17 +5,18 @@ FAQ How should I cite pysam ======================= -Pysam has not been published in print. 
When refering pysam, please +Pysam has not been published in print. When referring to pysam, please use the github URL: https://github.com/pysam-developers/pysam. As pysam is a wrapper around htslib and the samtools package, I -suggest cite `Li et al (2009) `. +suggest citing [Li.2009]_, [Bonfield.2021]_, and/or [Danecek.2021]_, +as appropriate. Is pysam thread-safe? ===================== Pysam is a mix of python and C code. Instructions within python are generally made thread-safe through python's `global interpreter lock`_ -(GIL_). This ensures that python data structures will always be in a +(:dfn:`GIL`). This ensures that python data structures will always be in a consistent state. If an external function outside python is called, the programmer has a @@ -28,7 +29,7 @@ Alternatively, the GIL can be released while the external function is called. This will allow other threads to run concurrently. This can be beneficial if the external function is expected to halt, for example when waiting for data to read or write. However, to achieve -thread-safety, the external function needs to implememented with +thread-safety, the external function needs to be implemented with thread-safety in mind. This means that there can be no shared state between threads, or if there is shared, it needs to be controlled to prevent any access conflicts. @@ -38,7 +39,7 @@ I/O intensive tasks. This is generally fine, but thread-safety of all parts have not been fully tested. A related issue is when different threads read from the same file -objec - or the same thread uses two iterators over a file. There is +object - or the same thread uses two iterators over a file. There is only a single file-position for each opened file. To prevent this from hapeding, use the option ``multiple_iterator=True`` when calling a fetch() method. 
This will return an iterator on a newly opened @@ -141,7 +142,7 @@ I can't call AlignmentFile.fetch on a file without index :meth:`~pysam.AlignmentFile.fetch` requires an index when iterating over a SAM/BAM file. To iterate over a file without -index, use the ``until_eof=True`:: +index, use the ``until_eof=True``:: bf = pysam.AlignmentFile(fname, "rb") for r in bf.fetch(until_eof=True): diff --git a/doc/glossary.rst b/doc/glossary.rst index 4e9fa57..0389270 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -48,6 +48,11 @@ Glossary Binary SAM format. BAM files are binary formatted, indexed and allow random access. + CRAM + CRAM is a binary format representing the same sequence alignment + information as SAM and BAM, but offering significantly better + lossless compression than BAM. + TAM Text SAM file. TAM files are human readable files of tab-separated fields. TAM files do not allow random access. @@ -106,6 +111,14 @@ Glossary BCF Binary :term:`VCF` + FASTA + Simple text format containing sequence data, with only the bare + minimum of metadata. Typically used for reference sequence data. + + FASTQ + Simple text format containing sequence data and associated base + qualities. + tabix Utility in the htslib package to index :term:`bgzip` compressed files. diff --git a/doc/index.rst b/doc/index.rst index 4e18b76..15de2ca 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,8 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.10.2*, *samtools-1.10* and -*bcftools-1.10.2*. +The current version wraps *htslib-1.13*, *samtools-1.13*, and *bcftools-1.13*. To install the latest release, type:: @@ -54,9 +53,21 @@ Contents: References ---------- -.. [Li2009] The Sequence Alignment/Map format and SAMtools. 
Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. - Bioinformatics. 2009 Aug 15;25(16):2078-9. Epub 2009 Jun 8. - `PMID: 19505943 `_ +.. [Li.2009] *The Sequence Alignment/Map format and SAMtools.* + Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. + Bioinformatics. 2009 Aug 15;25(16):2078-9. + Epub 2009 Jun 8 `btp352 `_. + PMID: `19505943 `_. + +.. [Bonfield.2021] *HTSlib: C library for reading/writing high-throughput sequencing data.* + Bonfield JK, Marshall J, Danecek P, Li H, Ohan V, Whitwham A, Keane T, Davies RM. + GigaScience (2021) 10(2) `giab007 `_. + PMID: `33594436 `_. + +.. [Danecek.2021] *Twelve years of SAMtools and BCFtools.* + Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. + GigaScience (2021) 10(2) `giab008 `_. + PMID: `33590861 `_. .. seealso:: diff --git a/doc/installation.rst b/doc/installation.rst index 535f4bc..a286c27 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -46,7 +46,7 @@ features. If these fail, for example due to missing library dependencies (`libcurl`, `libcrypto`), it will fall back to conservative defaults. -Options can be passed to the configure script explicitely by +Options can be passed to the configure script explicitly by setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`. For example:: diff --git a/doc/release.rst b/doc/release.rst index 06c602b..966ee6a 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,48 @@ Release notes ============= +Release 0.17.0 +============== + +This release wraps htslib/samtools/bcftools version 1.13. Corresponding +to new samtools commands, `pysam.samtools` now has additional functions +`ampliconclip`, `ampliconstats`, `fqimport`, and `version`. 
+ +Bugs fixed: + +* [#447] The maximum QNAME length is fully restored to 254 +* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors +* [#603] count_coverage: ignore reads that have no SEQ field +* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault +* [#983] Add win32/\*.[ch] to MANIFEST.in +* [#994] Raise exception in ``get_tid()`` if header could not be parsed +* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi +* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases +* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax +* [#1035] Improved handling of file iteration errors +* [#1038] ``tabix_index()`` no longer leaks file descriptors +* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value + (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based) +* *setup.py* no longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0 + +New facilities: + +* [PR #963] Additional VCF classes are exposed to pysam programmers +* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion +* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize +* Running tests with ``pytest`` now automatically runs ``make`` to generate test data + +Documentation improvements: + +* [#726] Clarify get_forward_sequence/get_forward_qualities documentation +* [#865] Improved example +* [#968] ``get_index_statistics`` parameters +* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open. 
+* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation +* [#999] Fix documentation for ``AlignmentFile.get_reference_length()`` +* [#1002] Document the default min_base_quality for ``pileup()`` + + Release 0.16.0 ============== @@ -149,7 +191,7 @@ Backwards incompatible changes: The rationale for this change is to have consistency between AlignmentFile and VariantFile. - + * AlignmentFile and FastaFile now raise IOError instead of OSError Medium term we plan to have a 1.0 release. The pysam @@ -190,6 +232,7 @@ contains a series of bugfixes. * [#473] A new FastxRecord class that can be instantiated from class and modified in-place. Replaces PersistentFastqProxy. * [#521] In AligmentFile, Simplify file detection logic and allow remote index files + * Removed attempts to guess data and index file names; this is magic left to htslib. * Removed file existence check prior to opening files with htslib @@ -200,6 +243,7 @@ contains a series of bugfixes. * Allow remote indices (tested using S3 signed URLs). * Document filepath_index and make it an alias for index_filename. * Added a require_index parameter to AlignmentFile + * [#526] handle unset ref when creating new records * [#513] fix bcf_translate to skip deleted FORMAT fields to avoid segfaults @@ -225,7 +269,7 @@ are created will need to change as the constructor requires a header:: header = pysam.AlignmentHeader( reference_names=["chr1", "chr2"], reference_lengths=[1000, 1000]) - + read = pysam.AlignedSegment(header) This will affect all code that instantiates AlignedSegment objects @@ -252,7 +296,7 @@ Release 0.11.2 ============== This release wraps htslib/samtools/bcfools versions 1.4.1 in response -to a security fix in these libraries. Additionaly the following +to a security fix in these libraries. 
Additionally the following issues have been fixed: * [#452] add GFF3 support for tabix parsers @@ -373,7 +417,7 @@ Overview -------- The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other -enchancements and bugfixes. See below for a detailed list. +enhancements and bugfixes. See below for a detailed list. `Htslib 1.3 `_ comes with additional capabilities for remote file access which depend @@ -416,7 +460,7 @@ Detailed release notes and code bloat. * run configure for the builtin htslib library in order to detect optional libraries such as libcurl. Configure behaviour can be - controlled by setting the environmet variable + controlled by setting the environment variable HTSLIB_CONFIGURE_OPTIONS. * get_reference_sequence() now returns the reference sequence and not something looking like it. This bug had effects on @@ -440,15 +484,17 @@ Potential isses when upgrading from v0.8.3: * renamed several methods for pep8 compatibility, old names still retained for backwards compatibility, but should be considered deprecated. + * gettid() is now get_tid() * getrname() is now get_reference_name() * parseRegion() is now parse_region() * some methods have changed for pep8 compatibility without the old names being present: + * fromQualityString() is now qualitystring_to_array() * toQualityString() is now qualities_to_qualitystring() - + * faidx now returns strings and not binary strings in py3. * The cython components have been broken up into smaller files with @@ -557,7 +603,7 @@ Release 0.8.2 with reading and writing capability. However, the interface is still incomplete and preliminary and lacks capability to mutate the resulting data. - + Release 0.8.1 ============= @@ -569,7 +615,7 @@ Release 0.8.1 * issue #19: multiple iterators can now be made to work on the same tabix file * issue #24: All strings returned from/passed to the pysam API are now unicode in python 3 * issue #5: type guessing for lists of integers fixed - + * API changes for consistency. 
The old API is still present, but deprecated. In particular: @@ -619,7 +665,7 @@ Other changes: Backwards incompatible changes -* Empty cigarstring now returns None (intstead of '') +* Empty cigarstring now returns None (instead of '') * Empty cigar now returns None (instead of []) * When using the extension classes in cython modules, AlignedRead needs to be substituted with AlignedSegment. @@ -686,18 +732,18 @@ Release 0.7.5 Release 0.7.4 ============= - + * further bugfixes to setup.py and package layout Release 0.7.3 ============= - + * further bugfixes to setup.py * upgraded distribute_setup.py to 0.6.34 Release 0.7.2 ============= - + * bugfix in installer - failed when cython not present * changed installation locations of shared libraries diff --git a/doc/usage.rst b/doc/usage.rst index f4b7498..fc4f2bb 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -269,7 +269,8 @@ simple variant attributes such as :class:`~pysam.VariantRecord.contig`, print (rec.pos) but also to complex attributes such as the contents to the -:term:`info`, :term:`format` and :term:`genotype` columns. These +:class:`~pysam.VariantRecord.info`, :class:`~pysam.VariantRecord.format` +and :term:`genotype` columns. 
These complex attributes are views on the underlying htslib data structures and provide dictionary-like access to the data:: diff --git a/import/pysam.c b/import/pysam.c index 5692622..2a81e4d 100644 --- a/import/pysam.c +++ b/import/pysam.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,25 @@ int @pysam@_puts(const char *s) return putc('\n', @pysam@_stdout); } + +static jmp_buf @pysam@_jmpbuf; +static int @pysam@_status = 0; + +int @pysam@_dispatch(int argc, char *argv[]) +{ + if (setjmp(@pysam@_jmpbuf) == 0) + return @pysam@_main(argc, argv); + else + return @pysam@_status; +} + +void @pysam@_exit(int status) +{ + @pysam@_status = status; + longjmp(@pysam@_jmpbuf, 1); +} + + void @pysam@_set_optind(int val) { // setting this in cython via diff --git a/import/pysam.h b/import/pysam.h index 6abb884..8dbb09e 100644 --- a/import/pysam.h +++ b/import/pysam.h @@ -3,6 +3,17 @@ #include +#ifndef __has_attribute +#define __has_attribute(attribute) 0 +#endif +#ifndef PYSAM_NORETURN +#if __has_attribute(__noreturn__) || __GNUC__ >= 3 +#define PYSAM_NORETURN __attribute__((__noreturn__)) +#else +#define PYSAM_NORETURN +#endif +#endif + extern FILE * @pysam@_stderr; extern FILE * @pysam@_stdout; @@ -40,6 +51,8 @@ int @pysam@_puts(const char *s); int @pysam@_dispatch(int argc, char *argv[]); +void PYSAM_NORETURN @pysam@_exit(int status); + void @pysam@_set_optind(int); extern int @pysam@_main(int argc, char *argv[]); diff --git a/pysam.py b/pysam.py deleted file mode 100644 index 0823abd..0000000 --- a/pysam.py +++ /dev/null @@ -1 +0,0 @@ -raise ImportError('''calling "import pysam" from the source directory is not supported - please import pysam from somewhere else.''') diff --git a/pysam/__init__.py b/pysam/__init__.py index 40877da..a6ff6d7 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -11,8 +11,8 @@ import pysam.libcfaidx as libcfaidx from pysam.libcfaidx import * import pysam.libctabix as libctabix from 
pysam.libctabix import * -# import pysam.libctabixproxies as libctabixproxies -# from pysam.libctabixproxies import * +import pysam.libctabixproxies as libctabixproxies +from pysam.libctabixproxies import * import pysam.libcsamfile as libcsamfile from pysam.libcsamfile import * import pysam.libcalignmentfile as libcalignmentfile diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index c964160..473c5b1 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -64,7 +64,7 @@ cdef class AlignedSegment: cdef class PileupColumn: - cdef bam_pileup1_t ** plp + cdef const bam_pileup1_t ** plp cdef int tid cdef int pos cdef int n_pu @@ -89,7 +89,7 @@ cdef AlignedSegment makeAlignedSegment( AlignmentHeader header) cdef PileupColumn makePileupColumn( - bam_pileup1_t ** plp, + const bam_pileup1_t ** plp, int tid, int pos, int n_pu, @@ -97,7 +97,7 @@ cdef PileupColumn makePileupColumn( char * reference_sequence, AlignmentHeader header) -cdef PileupRead makePileupRead(bam_pileup1_t * src, +cdef PileupRead makePileupRead(const bam_pileup1_t * src, AlignmentHeader header) cdef uint32_t get_alignment_length(bam1_t * src) diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 5674b49..da7274c 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -134,7 +134,7 @@ cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b): return toupper(ch) -cdef inline bint pileup_base_qual_skip(bam_pileup1_t * p, uint32_t threshold): +cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold): cdef uint32_t c if p.qpos < p.b.core.l_qseq: c = bam_get_qual(p.b)[p.qpos] @@ -608,7 +608,7 @@ cdef AlignedSegment makeAlignedSegment(bam1_t *src, cdef class PileupColumn -cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp, +cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp, int tid, int pos, int n_pu, @@ -635,7 +635,7 @@ cdef PileupColumn makePileupColumn(bam_pileup1_t ** 
plp, cdef class PileupRead -cdef PileupRead makePileupRead(bam_pileup1_t *src, +cdef PileupRead makePileupRead(const bam_pileup1_t *src, AlignmentHeader header): '''return a PileupRead object construted from a bam_pileup1_t * object.''' # note that the following does not call __init__ @@ -784,7 +784,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): # Check if MD tag is valid by matching CIGAR length to MD tag defined length # Insertions would be in addition to what is described by MD, so we calculate - # the number of insertions seperately. + # the number of insertions separately. cdef int insertions = 0 while s[s_idx] != 0: @@ -978,13 +978,13 @@ cdef class AlignedSegment: # requires a valid header. return "\t".join(map(str, (self.query_name, self.flag, - self.reference_id, - self.reference_start, + "#%d" % self.reference_id if self.reference_id >= 0 else "*", + self.reference_start + 1, self.mapping_quality, self.cigarstring, - self.next_reference_id, - self.next_reference_start, - self.query_alignment_length, + "#%d" % self.next_reference_id if self.next_reference_id >= 0 else "*", + self.next_reference_start + 1, + self.template_length, self.query_sequence, self.query_qualities, self.tags))) @@ -1169,10 +1169,8 @@ cdef class AlignedSegment: if qname is None or len(qname) == 0: return - # See issue #447 - # (The threshold is 252 chars, but this includes a \0 byte. - if len(qname) > 251: - raise ValueError("query length out of range {} > 251".format( + if len(qname) > 254: + raise ValueError("query length out of range {} > 254".format( len(qname))) qname = force_bytes(qname) @@ -1392,9 +1390,9 @@ cdef class AlignedSegment: read.query_squence = read.query_sequence[5:10] read.query_qualities = q[5:10] - The sequence is returned as it is stored in the BAM file. Some mappers - might have stored a reverse complement of the original read - sequence. + The sequence is returned as it is stored in the BAM file. 
(This will + be the reverse complement of the original read sequence if the mapper + has aligned the read to the reverse strand.) """ def __get__(self): if self.cache_query_sequence: @@ -1570,7 +1568,7 @@ cdef class AlignedSegment: def __set__(self, val): pysam_update_flag(self._delegate, val, BAM_FUNMAP) # setting the unmapped flag requires recalculation of - # bin as alignment length is now implicitely 1 + # bin as alignment length is now implicitly 1 update_bin(self._delegate) property mate_is_unmapped: @@ -1843,8 +1841,9 @@ cdef class AlignedSegment: def get_forward_sequence(self): """return the original read sequence. - Reads mapping to the reverse strand will be reverse - complemented. + Reads mapped to the reverse strand are stored reverse complemented in + the BAM file. This method returns such reads reverse complemented back + to their original orientation. Returns None if the record has no query sequence. """ @@ -1856,9 +1855,12 @@ cdef class AlignedSegment: return s def get_forward_qualities(self): - """return base qualities of the read sequence. + """return the original base qualities of the read sequence, + in the same format as the :attr:`query_qualities` property. - Reads mapping to the reverse strand will be reversed. + Reads mapped to the reverse strand have their base qualities stored + reversed in the BAM file. This method returns such reads' base qualities + reversed back to their original orientation. """ if self.is_reverse: return self.query_qualities[::-1] @@ -2242,7 +2244,7 @@ cdef class AlignedSegment: *value*. An existing value of the same *tag* will be overwritten unless - *replace* is set to False. This is usually not recommened as a + *replace* is set to False. This is usually not recommended as a tag may only appear once in the optional alignment section. If *value* is None, the tag will be deleted. 
@@ -2468,7 +2470,7 @@ cdef class AlignedSegment: return value def get_tags(self, with_value_type=False): - """the fields in the optional aligment section. + """the fields in the optional alignment section. Returns a list of all fields in the optional alignment section. Values are converted to appropriate python @@ -2841,7 +2843,7 @@ cdef class PileupColumn: raise ValueError("PileupColumn accessed after iterator finished") cdef int x - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL pileups = [] # warning: there could be problems if self.n and self.buf are @@ -2893,7 +2895,7 @@ cdef class PileupColumn: cdef uint32_t x = 0 cdef uint32_t c = 0 cdef uint32_t cnt = 0 - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") @@ -2941,7 +2943,7 @@ cdef class PileupColumn: mark_matches: bool - If True, output bases matching the reference as "," or "." + If True, output bases matching the reference as "." or "," for forward and reverse strand, respectively. This mark requires the reference sequence. If no reference is present, this option is ignored. 
@@ -2969,7 +2971,7 @@ cdef class PileupColumn: cdef uint8_t cc = 0 cdef uint8_t rb = 0 cdef kstring_t * buf = &self.buf - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") @@ -3052,7 +3054,7 @@ cdef class PileupColumn: list: a list of quality scores """ cdef uint32_t x = 0 - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL cdef uint32_t c = 0 result = [] for x from 0 <= x < self.n_pu: @@ -3083,7 +3085,7 @@ cdef class PileupColumn: raise ValueError("PileupColumn accessed after iterator finished") cdef uint32_t x = 0 - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) @@ -3109,7 +3111,7 @@ cdef class PileupColumn: raise ValueError("PileupColumn accessed after iterator finished") cdef uint32_t x = 0 - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) @@ -3135,7 +3137,7 @@ cdef class PileupColumn: raise ValueError("PileupColumn accessed after iterator finished") cdef uint32_t x = 0 - cdef bam_pileup1_t * p = NULL + cdef const bam_pileup1_t * p = NULL result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index 6ee4963..2a17fbe 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -58,24 +58,6 @@ cdef class AlignmentFile(HTSFile): cpdef int write(self, AlignedSegment read) except -1 -cdef class PileupColumn: - cdef bam_pileup1_t ** plp - cdef int tid - cdef int pos - cdef int n_pu - - -cdef class PileupRead: - cdef AlignedSegment _alignment - cdef int32_t _qpos - cdef int _indel - cdef int _level - cdef uint32_t _is_del - cdef uint32_t _is_head - cdef uint32_t _is_tail - cdef uint32_t _is_refskip - - cdef class IteratorRow: cdef int retval cdef bam1_t * b @@ 
-124,7 +106,7 @@ cdef class IteratorColumn: cdef int pos cdef int n_plp cdef uint32_t min_base_quality - cdef bam_pileup1_t * plp + cdef const bam_pileup1_t * plp cdef bam_mplp_t pileup_iter cdef __iterdata iterdata cdef AlignmentFile samfile diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index b8e4230..e192ff3 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -100,7 +100,7 @@ IndexStats = collections.namedtuple("IndexStats", ######################################################## ## global variables # maximum genomic coordinace -# for some reason, using 'int' causes overlflow +# for some reason, using 'int' causes overflow cdef int MAX_POS = (1 << 31) - 1 # valid types for SAM headers @@ -175,6 +175,12 @@ cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr): return header +def read_failure_reason(code): + if code == -2: + return 'truncated file' + else: + return "error {} while reading file".format(code) + # the following should be class-method for VariantHeader, but cdef @classmethods # are not implemented in cython. @@ -522,7 +528,10 @@ cdef class AlignmentHeader(object): returns -1 if reference is not known. """ reference = force_bytes(reference) - return bam_name2id(self.ptr, reference) + tid = bam_name2id(self.ptr, reference) + if tid < -1: + raise ValueError('could not parse header') + return tid def __str__(self): '''string with the full contents of the :term:`sam file` header as a @@ -1029,7 +1038,7 @@ cdef class AlignmentFile(HTSFile): See :meth:`~pysam.HTSFile.parse_region` for more information on how genomic regions can be specified. :term:`reference` and - `end` are also accepted for backward compatiblity as synonyms + `end` are also accepted for backward compatibility as synonyms for :term:`contig` and `stop`, respectively. Without a `contig` or `region` all mapped reads in the file @@ -1212,7 +1221,7 @@ cdef class AlignmentFile(HTSFile): """perform a :term:`pileup` within a :term:`region`. 
The region is specified by :term:`contig`, `start` and `stop` (using 0-based indexing). :term:`reference` and `end` are also accepted for - backward compatiblity as synonyms for :term:`contig` and `stop`, + backward compatibility as synonyms for :term:`contig` and `stop`, respectively. Alternatively, a samtools 'region' string can be supplied. @@ -1239,7 +1248,7 @@ cdef class AlignmentFile(HTSFile): By default, the samtools pileup engine outputs all reads overlapping a region. If truncate is True and a region is - given, only columns in the exact region specificied are + given, only columns in the exact region specified are returned. max_depth : int @@ -1288,7 +1297,7 @@ cdef class AlignmentFile(HTSFile): min_base_quality: int Minimum base quality. Bases below the minimum quality will - not be output. + not be output. The default is 13. adjust_capq_threshold: int @@ -1354,7 +1363,7 @@ cdef class AlignmentFile(HTSFile): The region is specified by :term:`contig`, `start` and `stop`. :term:`reference` and `end` are also accepted for backward - compatiblity as synonyms for :term:`contig` and `stop`, + compatibility as synonyms for :term:`contig` and `stop`, respectively. Alternatively, a :term:`samtools` :term:`region` string can be supplied. @@ -1458,7 +1467,7 @@ cdef class AlignmentFile(HTSFile): The region is specified by :term:`contig`, `start` and `stop`. :term:`reference` and `end` are also accepted for backward - compatiblity as synonyms for :term:`contig` and `stop`, + compatibility as synonyms for :term:`contig` and `stop`, respectively. Alternatively, a :term:`samtools` :term:`region` string can be supplied. The coverage is computed per-base [ACGT]. 
@@ -1575,6 +1584,8 @@ cdef class AlignmentFile(HTSFile): # count seq = read.seq + if seq is None: + continue quality = read.query_qualities for qpos, refpos in read.get_aligned_pairs(True): @@ -1779,7 +1790,8 @@ cdef class AlignmentFile(HTSFile): property nocoordinate: """int with total number of reads without coordinates according to the - statistics recorded in the index. This is a read-only attribute. + statistics recorded in the index, i.e., the statistic printed for "*" + by the ``samtools idxstats`` command. This is a read-only attribute. """ def __get__(self): self.check_index() @@ -1790,7 +1802,8 @@ cdef class AlignmentFile(HTSFile): def get_index_statistics(self): """return statistics about mapped/unmapped reads per chromosome as - they are stored in the index. + they are stored in the index, similarly to the statistics printed + by the ``samtools idxstats`` command. Returns: list : @@ -1846,12 +1859,12 @@ cdef class AlignmentFile(HTSFile): def __next__(self): cdef int ret = self.cnext() - if (ret >= 0): + if ret >= 0: return makeAlignedSegment(self.b, self.header) - elif ret == -2: - raise IOError('truncated file') - else: + elif ret == -1: raise StopIteration + else: + raise IOError(read_failure_reason(ret)) ########################################### # methods/properties referencing the header @@ -1886,7 +1899,7 @@ cdef class AlignmentFile(HTSFile): def get_reference_length(self, reference): """ - return :term:`reference` name corresponding to numerical :term:`tid` + return :term:`reference` length corresponding to numerical :term:`tid` """ if self.header is None: raise ValueError("header not available in closed files") @@ -2138,10 +2151,10 @@ cdef class IteratorRowHead(IteratorRow): if ret >= 0: self.current_row += 1 return makeAlignedSegment(self.b, self.header) - elif ret == -2: - raise IOError('truncated file') - else: + elif ret == -1: raise StopIteration + else: + raise IOError(read_failure_reason(ret)) cdef class IteratorRowAll(IteratorRow): @@ 
-2183,10 +2196,10 @@ cdef class IteratorRowAll(IteratorRow): cdef int ret = self.cnext() if ret >= 0: return makeAlignedSegment(self.b, self.header) - elif ret == -2: - raise IOError('truncated file') - else: + elif ret == -1: raise StopIteration + else: + raise IOError(read_failure_reason(ret)) cdef class IteratorRowAllRefs(IteratorRow): @@ -2217,7 +2230,7 @@ cdef class IteratorRowAllRefs(IteratorRow): self.rowiter = IteratorRowRegion(self.samfile, self.tid, 0, - 1<<29) + MAX_POS) # set htsfile and header of the rowiter # to the values in this iterator to reflect multiple_iterators self.rowiter.htsfile = self.htsfile @@ -2301,10 +2314,10 @@ cdef class IteratorRowSelection(IteratorRow): cdef int ret = self.cnext() if ret >= 0: return makeAlignedSegment(self.b, self.header) - elif ret == -2: - raise IOError('truncated file') - else: + elif ret == -1: raise StopIteration + else: + raise IOError(read_failure_reason(ret)) cdef int __advance_nofilter(void *data, bam1_t *b): @@ -2434,7 +2447,7 @@ cdef class IteratorColumn: For reasons of efficiency, the iterator points to the current pileup buffer. The pileup buffer is updated at every iteration. - This might cause some unexpected behavious. For example, + This might cause some unexpected behaviour. For example, consider the conversion to a list:: f = AlignmentFile("file.bam", "rb") @@ -2661,7 +2674,7 @@ cdef class IteratorColumn: # reset in order to avoid memory leak messages for iterators # that have not been fully consumed self._free_pileup_iter() - self.plp = NULL + self.plp = NULL if self.iterdata.seq != NULL: free(self.iterdata.seq) @@ -2858,9 +2871,7 @@ cdef class SNPCall: cdef class IndexedReads: - """*(AlignmentFile samfile, multiple_iterators=True) - - Index a Sam/BAM-file by query name while keeping the + """Index a Sam/BAM-file by query name while keeping the original sort order intact. The index is kept in memory and can be substantial. 
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index c9bcbd2..05a5fe8 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -106,6 +106,24 @@ from pysam.utils import unquoted_str __all__ = ['VariantFile', 'VariantHeader', 'VariantHeaderRecord', + 'VariantHeaderRecords', + 'VariantMetadata', + 'VariantHeaderMetadata', + 'VariantContig', + 'VariantHeaderContigs', + 'VariantHeaderSamples', + 'VariantRecordFilter', + 'VariantRecordFormat', + 'VariantRecordInfo', + 'VariantRecordSamples', + 'VariantRecord', + 'VariantRecordSample', + 'BaseIndex', + 'BCFIndex', + 'TabixIndex', + 'BaseIterator', + 'BCFIterator', + 'TabixIterator', 'VariantRecord'] ######################################################################## @@ -125,7 +143,7 @@ cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') ######################################################################## from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len -from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes ######################################################################## @@ -166,7 +184,7 @@ cdef inline bcf_str_cache_get_charptr(const char* s): ######################################################################## cdef int comb(int n, int k) except -1: - """Return binomial coeffient: n choose k + """Return binomial coefficient: n choose k >>> comb(5, 1) 5 @@ -284,7 +302,7 @@ cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int sca else: # Otherwise, copy the entire block b = datac[:n] - value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b',')) + value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b',')) else: value = [] if type == BCF_BT_INT8: @@ -3141,7 +3159,7 @@ cdef class VariantRecord(object): # causes a memory leak 
https://github.com/pysam-developers/pysam/issues/773 # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None if (r.d.m_id == 0): - raise ValueError('Error extracing ID') + raise ValueError('Error extracting ID') return charptr_to_str(r.d.id) if r.d.id != b'.' else None @id.setter @@ -3755,7 +3773,7 @@ cdef class BaseIterator(object): pass -# Interal function to clean up after iteration stop or failure. +# Internal function to clean up after iteration stop or failure. # This would be a nested function if it weren't a cdef function. cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record): bcf_destroy1(record) @@ -3786,7 +3804,7 @@ cdef class BCFIterator(BaseIterator): try: rid = index.refmap[contig] except KeyError: - # A query for a non-existant contig yields an empty iterator, does not raise an error + # A query for a non-existent contig yields an empty iterator, does not raise an error self.iter = NULL return @@ -3874,7 +3892,7 @@ cdef class TabixIterator(BaseIterator): try: rid = index.refmap[contig] except KeyError: - # A query for a non-existant contig yields an empty iterator, does not raise an error + # A query for a non-existent contig yields an empty iterator, does not raise an error self.iter = NULL return @@ -4346,9 +4364,10 @@ cdef class VariantFile(HTSFile): return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None): - """fetch records in a :term:`region` using 0-based indexing. The - region is specified by :term:`contig`, *start* and *end*. - Alternatively, a samtools :term:`region` string can be supplied. + """fetch records in a :term:`region`, specified either by + :term:`contig`, *start*, and *end* (which are 0-based, half-open); + or alternatively by a samtools :term:`region` string (which is + 1-based inclusive). Without *contig* or *region* all mapped records will be fetched. 
The records will be returned ordered by contig, which will not necessarily diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd index 62a6f3d..d57f784 100644 --- a/pysam/libcbcftools.pxd +++ b/pysam/libcbcftools.pxd @@ -1,6 +1,6 @@ cdef extern from "bcftools.pysam.h": - int bcftools_main(int argc, char *argv[]) + int bcftools_dispatch(int argc, char *argv[]) void bcftools_set_stderr(int fd) void bcftools_close_stderr() void bcftools_set_stdout(int fd) diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index a70d42d..e73adf9 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -496,7 +496,7 @@ cdef class FastxRecord: cdef class FastxFile: - """Stream access to :term:`fasta` or :term:`fastq` formatted files. + r"""Stream access to :term:`fasta` or :term:`fastq` formatted files. The file is automatically opened. @@ -541,7 +541,7 @@ cdef class FastxFile: ... print(entry.quality) >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout: ... for entry in fin: - ... fout.write(str(entry)) + ... fout.write(str(entry) + '\n') """ def __cinit__(self, *args, **kwargs): diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 370e492..9684ef9 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -275,7 +275,7 @@ cdef extern from "htslib/bgzf.h" nogil: int SEEK_SET # Return a virtual file pointer to the current location in the file. - # No interpetation of the value should be made, other than a subsequent + # No interpretation of the value should be made, other than a subsequent # call to bgzf_seek can be used to position the file at the same point. # Return value is non-negative on success. int64_t bgzf_tell(BGZF *fp) @@ -326,7 +326,7 @@ cdef extern from "htslib/bgzf.h" nogil: # Read one line from a BGZF file. 
It is faster than bgzf_getc() # # @param fp BGZF file handler - # @param delim delimitor + # @param delim delimiter # @param str string to write to; must be initialized # @return length of the string; 0 on end-of-file; negative on error int bgzf_getline(BGZF *fp, int delim, kstring_t *str) @@ -796,7 +796,7 @@ cdef extern from "htslib/hts.h" nogil: ctypedef struct hts_md5_context - # /*! @abstract Intialises an MD5 context. + # /*! @abstract Initialises an MD5 context. # * @discussion # * The expected use is to allocate an hts_md5_context using # * hts_md5_init(). This pointer is then passed into one or more calls @@ -1353,10 +1353,10 @@ cdef extern from "htslib/tbx.h" nogil: # tbx.h definitions int8_t TBX_MAX_SHIFT - int8_t TBX_GENERIC - int8_t TBX_SAM - int8_t TBX_VCF - int8_t TBX_UCSC + int32_t TBX_GENERIC + int32_t TBX_SAM + int32_t TBX_VCF + int32_t TBX_UCSC ctypedef struct tbx_conf_t: int32_t preset @@ -1418,7 +1418,7 @@ cdef extern from "htslib/vcf.h" nogil: # === Dictionary === # - # The header keeps three dictonaries. The first keeps IDs in the + # The header keeps three dictionaries. The first keeps IDs in the # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] # is the actual hash table, which is opaque to the end users. In the hash @@ -2112,8 +2112,7 @@ cdef extern from "htslib/vcfutils.h" nogil: # be determined. # # The value of @which determines if existing INFO/AC,AN can be - # used (BCF_UN_INFO) and and if indv fields can be splitted - # (BCF_UN_FMT). + # used (BCF_UN_INFO) and if indv fields can be split (BCF_UN_FMT). int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) # bcf_gt_type() - determines type of the genotype @@ -2261,7 +2260,7 @@ cdef extern from "htslib/cram.h" nogil: # the container, meaning multiple compression headers to manipulate.
# Changing RG may change the size of the compression header and # therefore the length field in the container. Hence we rewrite all - # blocks just incase and also emit the adjusted container. + # blocks just in case and also emit the adjusted container. # # The current implementation can only cope with renumbering a single # RG (and only then if it is using HUFFMAN or BETA codecs). In @@ -2511,7 +2510,7 @@ cdef extern from "htslib/cram.h" nogil: # 2 if the file is a stream and thus unseekable # 1 if the file contains an EOF block # 0 if the file does not contain an EOF block - # -1 if an error occured whilst reading the file or we could not seek back to where we were + # -1 if an error occurred whilst reading the file or we could not seek back to where we were # # int cram_check_EOF(cram_fd *fd) diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 92d4e8f..778fc23 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -72,7 +72,7 @@ cdef class HFile(object): cdef hFILE *fp cdef readonly object name, mode - def __init__(self, name, mode='r', closedf=True): + def __init__(self, name, mode='r', closefd=True): self._open(name, mode, closefd=True) def __dealloc__(self): @@ -585,7 +585,7 @@ cdef class HTSFile(object): rval = hts_opt_apply(self.htsfile, opts) if rval != 0: hts_opt_free(opts) - raise RuntimeError('An error occured while applying the requested format options') + raise RuntimeError('An error occurred while applying the requested format options') hts_opt_free(opts) def parse_region(self, contig=None, start=None, stop=None, @@ -595,7 +595,7 @@ cdef class HTSFile(object): either be specified by :term:`contig`, `start` and `stop`. `start` and `stop` denote 0-based, half-open intervals. :term:`reference` and `end` are also accepted for - backward compatiblity as synonyms for :term:`contig` and + backward compatibility as synonyms for :term:`contig` and `stop`, respectively. 
Alternatively, a samtools :term:`region` string can be diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd index 70fda60..3c39476 100644 --- a/pysam/libcsamtools.pxd +++ b/pysam/libcsamtools.pxd @@ -1,6 +1,6 @@ cdef extern from "samtools.pysam.h": - int samtools_main(int argc, char *argv[]) + int samtools_dispatch(int argc, char *argv[]) void samtools_set_stderr(int fd) void samtools_close_stderr() void samtools_set_stdout(int fd) diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index e581b61..4436420 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -53,7 +53,6 @@ # DEALINGS IN THE SOFTWARE. # ############################################################################### -import binascii import os import sys @@ -75,8 +74,8 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ tbx_destroy, hisremote, region_list, hts_getline, \ - TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, htsExactFormat, bcf, \ - bcf_index_build2 + TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \ + no_compression, bcf, bcf_index_build2 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size @@ -302,7 +301,7 @@ cdef class TabixFile: index : string The filename of the index. If not set, the default is to - assume that the index is called ``filename.tbi` + assume that the index is called ``filename.tbi`` mode : char The file opening mode. Currently, only ``r`` is permitted. 
@@ -581,7 +580,7 @@ cdef class TabixFile: property contigs: '''list of chromosome names''' def __get__(self): - cdef char ** sequences + cdef const char ** sequences cdef int nsequences with nogil: @@ -880,13 +879,6 @@ def tabix_compress(filename_in, raise IOError("error %i when closing file %s" % (r, filename_in)) -def is_gzip_file(filename): - gzip_magic_hex = b'1f8b' - fd = os.open(filename, os.O_RDONLY) - header = os.read(fd, 2) - return header == binascii.a2b_hex(gzip_magic_hex) - - def tabix_index(filename, force=False, seq_col=None, @@ -928,16 +920,13 @@ def tabix_index(filename, compressed. The original file will be removed and only the compressed file will be retained. - *min-shift* sets the minimal interval size to 1< 0: suffix = ".csi" + if min_shift <= 0: min_shift = 14 else: suffix = ".tbi" + min_shift = 0 + index = index or filename + suffix fn_index = encode_filename(index) @@ -1024,7 +1016,7 @@ def tabix_index(filename, cdef char *fnidx = fn_index cdef int retval = 0 - if csi and fmt == bcf: + if csi and fmt.format == bcf: with nogil: retval = bcf_index_build2(cfn, fnidx, min_shift) else: diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd index 9e1cce1..d78b706 100644 --- a/pysam/libcutils.pxd +++ b/pysam/libcutils.pxd @@ -14,15 +14,21 @@ cpdef array_to_qualitystring(c_array.array arr, int offset=*) cpdef qualities_to_qualitystring(qualities, int offset=*) ######################################################################## +## String encoding configuration facilities ######################################################################## + +cpdef get_encoding_error_handler() +cpdef set_encoding_error_handler(name) + ######################################################################## ## Python 3 compatibility functions ######################################################################## -cdef charptr_to_str(const char *s, encoding=*) -cdef bytes charptr_to_bytes(const char *s, encoding=*) -cdef charptr_to_str_w_len(const char* s, 
size_t n, encoding=*) -cdef force_str(object s, encoding=*) -cdef bytes force_bytes(object s, encoding=*) +cdef charptr_to_str(const char *s, encoding=*, errors=*) +cdef bytes charptr_to_bytes(const char *s, encoding=*, errors=*) +cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*, errors=*) +cdef force_str(object s, encoding=*, errors=*) +cdef bytes force_bytes(object s, encoding=*, errors=*) +cdef decode_bytes(bytes s, encoding=*, errors=*) cdef bytes encode_filename(object filename) cdef from_string_and_size(const char *s, size_t length) diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index fe61bb8..adc9cec 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -6,6 +6,7 @@ import tempfile import os import io from contextlib import contextmanager +from codecs import register_error from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION from cpython cimport PyBytes_Check, PyUnicode_Check @@ -17,10 +18,10 @@ from libc.stdio cimport fprintf, stderr, fflush from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY -from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \ +from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \ samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind -from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \ +from libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \ bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind ##################################################################### @@ -82,7 +83,27 @@ cpdef qualities_to_qualitystring(qualities, int offset=33): ######################################################################## +## String encoding configuration facilities ######################################################################## + +# Codec error handler that 
just interprets each bad byte as ISO-8859-1. +def latin1_replace(exception): + return (chr(exception.object[exception.start]), exception.end) + +register_error('pysam.latin1replace', latin1_replace) + + +cdef str ERROR_HANDLER = 'strict' + +cpdef get_encoding_error_handler(): + return ERROR_HANDLER + +cpdef set_encoding_error_handler(name): + global ERROR_HANDLER + previous = ERROR_HANDLER + ERROR_HANDLER = name + return previous + ######################################################################## ## Python 3 compatibility functions ######################################################################## @@ -91,7 +112,7 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 cdef from_string_and_size(const char* s, size_t length): if IS_PYTHON3: - return s[:length].decode("utf8") + return s[:length].decode('utf-8', ERROR_HANDLER) else: return s[:length] @@ -115,7 +136,7 @@ cdef bytes encode_filename(object filename): raise TypeError("Argument must be string or unicode.") -cdef bytes force_bytes(object s, encoding=TEXT_ENCODING): +cdef bytes force_bytes(object s, encoding=None, errors=None): """convert string or unicode object to bytes, assuming utf8 encoding. 
""" @@ -124,37 +145,37 @@ cdef bytes force_bytes(object s, encoding=TEXT_ENCODING): elif PyBytes_Check(s): return s elif PyUnicode_Check(s): - return s.encode(encoding) + return s.encode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) else: raise TypeError("Argument must be string, bytes or unicode.") -cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING): +cdef charptr_to_str(const char* s, encoding=None, errors=None): if s == NULL: return None if PY_MAJOR_VERSION < 3: return s else: - return s.decode(encoding) + return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) -cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING): +cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None): if s == NULL: return None if PY_MAJOR_VERSION < 3: return s[:n] else: - return s[:n].decode(encoding) + return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) -cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING): +cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None): if s == NULL: return None else: return s -cdef force_str(object s, encoding=TEXT_ENCODING): +cdef force_str(object s, encoding=None, errors=None): """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)""" if s is None: @@ -162,12 +183,21 @@ cdef force_str(object s, encoding=TEXT_ENCODING): if PY_MAJOR_VERSION < 3: return s elif PyBytes_Check(s): - return s.decode(encoding) + return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) else: # assume unicode return s +cdef decode_bytes(bytes s, encoding=None, errors=None): + """Return s converted to current Python's str type, + always decoding even in Python 2""" + if s is None: + return None + else: + return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) + + cpdef parse_region(contig=None, start=None, stop=None, @@ -179,7 +209,7 @@ cpdef parse_region(contig=None, `end`. 
`start` and `end` denote 0-based, half-open intervals. :term:`reference` and `end` are also accepted for backward - compatiblity as synonyms for :term:`contig` and `stop`, + compatibility as synonyms for :term:`contig` and `stop`, respectively. Alternatively, a samtools :term:`region` string can be supplied. @@ -386,13 +416,13 @@ def _pysam_dispatch(collection, if collection == b"samtools": samtools_set_stdout(stdout_h) samtools_set_stderr(stderr_h) - retval = samtools_main(n + 2, cargs) + retval = samtools_dispatch(n + 2, cargs) samtools_close_stdout() samtools_close_stderr() elif collection == b"bcftools": bcftools_set_stdout(stdout_h) bcftools_set_stderr(stderr_h) - retval = bcftools_main(n + 2, cargs) + retval = bcftools_dispatch(n + 2, cargs) bcftools_close_stdout() bcftools_close_stderr() @@ -425,6 +455,10 @@ def _pysam_dispatch(collection, return retval, out_stderr, out_stdout -__all__ = ["qualitystring_to_array", - "array_to_qualitystring", - "qualities_to_qualitystring"] +__all__ = [ + "qualitystring_to_array", + "array_to_qualitystring", + "qualities_to_qualitystring", + "get_encoding_error_handler", + "set_encoding_error_handler", +] diff --git a/pysam/samtools.py b/pysam/samtools.py index 58cc2ee..9042cc1 100644 --- a/pysam/samtools.py +++ b/pysam/samtools.py @@ -37,6 +37,10 @@ SAMTOOLS_DISPATCH = { "quickcheck": ("quickcheck", None), "split": ("split", None), "flags": ("flags", None), + "ampliconclip": ("ampliconclip", None), + "ampliconstats": ("ampliconstats", None), + "version": ("version", None), + "fqimport": ("import", None), } # instantiate samtools commands as python functions diff --git a/pysam/version.h b/pysam/version.h index 7c4ea99..33676ea 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.10 (pysam)" -#define BCFTOOLS_VERSION "1.10.2 (pysam)" -#define HTS_VERSION_TEXT "1.10.2 (pysam)" +#define SAMTOOLS_VERSION "1.13 
(pysam)" +#define BCFTOOLS_VERSION "1.13 (pysam)" +#define HTS_VERSION_TEXT "1.13 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index 3ad71c7..8c871ba 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information -__version__ = "0.16.0.1" +__version__ = "0.17.0" -__samtools_version__ = "1.10" -__bcftools_version__ = "1.10.2" -__htslib_version__ = "1.10.2" +__samtools_version__ = "1.13" +__bcftools_version__ = "1.13" +__htslib_version__ = "1.13" diff --git a/samtools/LICENSE b/samtools/LICENSE index 3c56f48..cd102b8 100644 --- a/samtools/LICENSE +++ b/samtools/LICENSE @@ -1,6 +1,6 @@ The MIT/Expat License -Copyright (C) 2008-2019 Genome Research Ltd. +Copyright (C) 2008-2021 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/samtools/README b/samtools/README index bb7af6c..dd27670 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.10 # Within the unpacked release directory + cd .../samtools-1.13 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. 
Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.10 # Within the unpacked release directory + cd .../samtools-1.13 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.10 # Within the unpacked release directory + cd .../samtools-1.13 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. In that case you can use: - cd .../samtools-1.10 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.10 + cd .../samtools-1.13 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.13 make all all-htslib It is possible to override the built-in search path using the HTS_PATH @@ -99,3 +99,28 @@ Benchmarks comparing the various zlibs are available at: It is recommended that you perform your own rigorous tests for an entire pipeline if you wish to switch to one of the optimised zlib implementations. 
+ +Citing +====== + +Please cite this paper when using SAMtools for your publications: + +Twelve years of SAMtools and BCFtools +Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li +GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 + +@article{10.1093/gigascience/giab008, + author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng}, + title = "{Twelve years of SAMtools and BCFtools}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods. The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines. Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed \\>1 million times via Bioconda.
The source code and documentation are available from https://www.htslib.org.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab008}, + url = {https://doi.org/10.1093/gigascience/giab008}, + note = {giab008}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf}, +} diff --git a/samtools/amplicon_stats.c b/samtools/amplicon_stats.c new file mode 100644 index 0000000..62bb15c --- /dev/null +++ b/samtools/amplicon_stats.c @@ -0,0 +1,1754 @@ +/* stats.c -- This is the former bamcheck integrated into samtools/htslib. + + Copyright (C) 2020-2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + * This tool is designed to give "samtools stats" style output, but dedicated + * to small amplicon sequencing projects. It gathers stats on the + * distribution of reads across amplicons. + */ + +/* + * TODO: + * - Cope with multiple references. What do we do here? Just request one? 
+ * - Permit regions rather than consuming whole file (maybe solves above). + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "samtools.h" +#include "sam_opts.h" +#include "bam_ampliconclip.h" + +KHASH_MAP_INIT_INT64(tcoord, int64_t) +KHASH_MAP_INIT_STR(qname, int64_t) + +#ifndef MIN +#define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef MAX +#define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +#ifndef ABS +#define ABS(a) ((a)>=0?(a):-(a)) +#endif + +#define TCOORD_MIN_COUNT 10 +#define MAX_AMP 1000 // Default maximum number of amplicons +#define MAX_AMP_LEN 1000 // Default maximum length of any single amplicon +#define MAX_PRIMER_PER_AMPLICON 4 // Max primers per LEFT/RIGHT +#define MAX_DEPTH 5 // Number of different depths permitted + +typedef struct { + sam_global_args ga; + uint32_t flag_require; + uint32_t flag_filter; + int max_delta; // Used for matching read to amplicon primer loc + int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep + int use_sample_name; + int max_amp; // Total number of amplicons + int max_amp_len; // Maximum length of an individual amplicon + double depth_bin;// aggregate depth within this fraction + int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate + FILE *out_fp; + char *argv; + int tcoord_min_count; + int tcoord_bin; + int multi_ref; +} astats_args_t; + +typedef struct { + int nseq; // total sequence count + int nfiltered; // sequence filtered + int nfailprimer;// count of sequences not matching the primer locations + + // Sizes of memory allocated below, to permit reset + int max_amp, max_amp_len, max_len; + + // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc + int64_t *nreads, *nreads2; // [max_amp] + double *nfull_reads; // [max_amp]; 0.5/read if paired. 
+ double *nrperc, *nrperc2; // [max_amp] + int64_t *nbases, *nbases2; // [max_amp] + int64_t *coverage; // [max_amp][max_amp_len] + double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH] + double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH]; + khash_t(tcoord) **tcoord; // [max_amp+1] + + // 0 is correct pair, 1 is incorrect pair, 2 is unidentified + int (*amp_dist)[3]; // [MAX_AMP][3]; + + int *depth_valid; // [max_len] + int *depth_all; // [max_len] + khash_t(qname) *qend; // queryname end, for overlap removal +} astats_t; + +// We can have multiple primers for LEFT / RIGHT, so this +// permits detection by any compatible combination. +// One reference: +typedef struct { + int64_t left[MAX_PRIMER_PER_AMPLICON]; + int nleft; + int64_t right[MAX_PRIMER_PER_AMPLICON]; + int nright; + int64_t max_left, min_right; // inner dimensions + int64_t min_left, max_right; // outer dimensions +} amplicon_t; + +// Multiple references, we have an array of amplicons_t - one per used ref. +// We have per reference local and global stats here, as some of the stats +// are coordinate based. However we report them combined together as a single +// list across all references. +// "namp" is the number of amplicons in this reference, but they're +// numbered first_amp to first_amp+namp-1 inclusively. +typedef struct { + int tid, namp; + int64_t len; + bed_entry_list_t *sites; + amplicon_t *amp; + astats_t *lstats, *gstats; // local (1 file) and global (all file) stats + const char *ref; // ref name (pointer to the bed hash table key) + int first_amp; // first amplicon number for this ref +} amplicons_t; + +// Reinitialised for each new reference/chromosome. +// Counts from 1 to namp, -1 for no match and 0 for ?. +static int *pos2start = NULL; +static int *pos2end = NULL; +static int pos2size = 0; // allocated size of pos2start/end + +// Lookup table to go from position to amplicon based on +// read start / end. 
+static int initialise_amp_pos_lookup(astats_args_t *args, + amplicons_t *amps, + int ref) { + int64_t i, j; + amplicon_t *amp = amps[ref].amp; + int64_t max_len = amps[ref].len; + int namp = amps[ref].namp; + + if (max_len+1 > pos2size) { + if (!(pos2start = realloc(pos2start, (max_len+1)*sizeof(*pos2start)))) + return -1; + if (!(pos2end = realloc(pos2end, (max_len+1)*sizeof(*pos2end)))) + return -1; + pos2size = max_len; + } + for (i = 0; i < max_len; i++) + pos2start[i] = pos2end[i] = -1; + + for (i = 0; i < namp; i++) { + for (j = 0; j < amp[i].nleft; j++) { + int64_t p; + for (p = amp[i].left[j] - args->max_delta; + p <= amp[i].left[j] + args->max_delta; p++) { + if (p < 1 || p > max_len) + continue; + pos2start[p-1] = i; + } + } + for (j = 0; j < amp[i].nright; j++) { + int64_t p; + for (p = amp[i].right[j] - args->max_delta; + p <= amp[i].right[j] + args->max_delta; p++) { + if (p < 1 || p > max_len) + continue; + pos2end[p-1] = i; + } + } + } + + return 0; +} + +// Counts amplicons. +// Assumption: input BED file alternates between LEFT and RIGHT primers +// per amplicon, thus we can count the number based on the switching +// orientation. +static int count_amplicon(bed_entry_list_t *sites) { + int i, namp, last_rev = 0; + for (i = namp = 0; i < sites->length; i++) { + if (sites->bp[i].rev == 0 && last_rev) + namp++; + last_rev = sites->bp[i].rev; + } + + return ++namp; +} + +// We're only interest in the internal part of the amplicon. +// Our bed file has LEFT start/end followed by RIGHT start/end, +// so collapse these to LEFT end / RIGHT start. +// +// Returns right most amplicon position on success, +// < 0 on error +static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites, + amplicon_t *amp, int *namp, int do_title, + const char *ref, int first_amp) { + int i, j; + int64_t max_right = 0; + FILE *ofp = args->out_fp; + + *namp = 0; + + // Assume all primers for the same amplicon are adjacent in BED + // with all + followed by all -. 
Thus - to + signifies next primer set. + int last_rev = 0; + amp[0].max_left = 0; + amp[0].min_right = INT64_MAX; + amp[0].min_left = INT64_MAX; + amp[0].max_right = 0; + if (do_title) { + fprintf(ofp, "# Amplicon locations from BED file.\n"); + fprintf(ofp, "# LEFT/RIGHT are - format and " + "comma-separated for alt-primers.\n"); + if (args->multi_ref) + fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n"); + else + fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n"); + } + for (i = j = 0; i < sites->length; i++) { + if (i == 0 && sites->bp[i].rev != 0) { + fprintf(stderr, "[ampliconstats] error: BED file should start" + " with the + strand primer\n"); + return -1; + } + if (sites->bp[i].rev == 0 && last_rev) { + j++; + if (j >= args->max_amp) { + fprintf(stderr, "[ampliconstats] error: too many amplicons" + " (%d). Use -a option to raise this.\n", j); + return -1; + } + amp[j].max_left = 0; + amp[j].min_right = INT64_MAX; + amp[j].min_left = INT64_MAX; + amp[j].max_right = 0; + } + if (sites->bp[i].rev == 0) { + if (i == 0 || last_rev) { + if (j>0) fprintf(ofp, "\n"); + if (args->multi_ref) + fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp); + else + fprintf(ofp, "AMPLICON\t%d", j+1); + } + if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) { + print_error_errno("ampliconstats", + "too many primers per amplicon (%d).\n", + MAX_PRIMER_PER_AMPLICON); + return -1; + } + amp[j].left[amp[j].nleft++] = sites->bp[i].right; + if (amp[j].max_left < sites->bp[i].right+1) + amp[j].max_left = sites->bp[i].right+1; + if (amp[j].min_left > sites->bp[i].right+1) + amp[j].min_left = sites->bp[i].right+1; + // BED file, so left+1 as zero based. right(+1-1) as + // BED goes one beyond end (and we want inclusive range). 
+ fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1], + sites->bp[i].left+1, sites->bp[i].right); + } else { + if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) { + print_error_errno("ampliconstats", + "too many primers per amplicon (%d)", + MAX_PRIMER_PER_AMPLICON); + return -1; + } + amp[j].right[amp[j].nright++] = sites->bp[i].left; + if (amp[j].min_right > sites->bp[i].left-1) + amp[j].min_right = sites->bp[i].left-1; + if (amp[j].max_right < sites->bp[i].left-1) { + amp[j].max_right = sites->bp[i].left-1; + if (amp[j].max_right - amp[j].min_left + 1 >= + args->max_amp_len) { + fprintf(stderr, "[ampliconstats] error: amplicon " + "longer (%d) than max_amp_len option (%d)\n", + (int)(amp[j].max_right - amp[j].min_left + 2), + args->max_amp_len); + return -1; + } + if (max_right < amp[j].max_right) + max_right = amp[j].max_right; + } + fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1], + sites->bp[i].left+1, sites->bp[i].right); + } + last_rev = sites->bp[i].rev; + } + if (last_rev != 1) { + fprintf(ofp, "\n"); // useful if going to stdout + fprintf(stderr, "[ampliconstats] error: bed file does not end on" + " a reverse strand primer.\n"); + return -1; + } + *namp = ++j; + if (j) fprintf(ofp, "\n"); + + if (j >= args->max_amp) { + fprintf(stderr, "[ampliconstats] error: " + "too many amplicons (%d). 
Use -a option to raise this.", j); + return -1; + } + +// for (i = 0; i < *namp; i++) { +// printf("%d\t%ld", i, amp[i].length); +// for (j = 0; j < amp[i].nleft; j++) +// printf("%c%ld", "\t,"[j>0], amp[i].left[j]); +// for (j = 0; j < amp[i].nright; j++) +// printf("%c%ld", "\t,"[j>0], amp[i].right[j]); +// printf("\n"); +// } + + return max_right; +} + +void stats_free(astats_t *st) { + if (!st) + return; + + free(st->nreads); + free(st->nreads2); + free(st->nfull_reads); + free(st->nrperc); + free(st->nrperc2); + free(st->nbases); + free(st->nbases2); + free(st->coverage); + free(st->covered_perc); + free(st->covered_perc2); + free(st->amp_dist); + + free(st->depth_valid); + free(st->depth_all); + + if (st->tcoord) { + int i; + for (i = 0; i <= st->max_amp; i++) { + if (st->tcoord[i]) + kh_destroy(tcoord, st->tcoord[i]); + } + free(st->tcoord); + } + + khiter_t k; + for (k = kh_begin(st->qend); k != kh_end(st->qend); k++) + if (kh_exist(st->qend, k)) + free((void *)kh_key(st->qend, k)); + kh_destroy(qname, st->qend); + + free(st); +} + +astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) { + astats_t *st = calloc(1, sizeof(*st)); + if (!st) + return NULL; + + st->max_amp = max_amp; + st->max_amp_len = max_amp_len; + st->max_len = max_len; + + if (!(st->nreads = calloc(max_amp, sizeof(*st->nreads)))) goto err; + if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err; + if (!(st->nrperc = calloc(max_amp, sizeof(*st->nrperc)))) goto err; + if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err; + if (!(st->nbases = calloc(max_amp, sizeof(*st->nbases)))) goto err; + if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err; + + if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads)))) + goto err; + + if (!(st->coverage = calloc(max_amp*max_amp_len, sizeof(*st->coverage)))) + goto err; + + if (!(st->covered_perc = calloc(max_amp, sizeof(*st->covered_perc)))) + goto err; + if (!(st->covered_perc2 = 
calloc(max_amp, sizeof(*st->covered_perc2)))) + goto err; + + if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err; + int i; + for (i = 0; i <= st->max_amp; i++) + if (!(st->tcoord[i] = kh_init(tcoord))) + goto err; + + if (!(st->qend = kh_init(qname))) + goto err; + + if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid)))) + goto err; + if (!(st->depth_all = calloc(max_len, sizeof(*st->depth_all)))) + goto err; + + if (!(st->amp_dist = calloc(max_amp, sizeof(*st->amp_dist)))) goto err; + + return st; + + err: + stats_free(st); + return NULL; +} + +static void stats_reset(astats_t *st) { + st->nseq = 0; + st->nfiltered = 0; + st->nfailprimer = 0; + + memset(st->nreads, 0, st->max_amp * sizeof(*st->nreads)); + memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2)); + memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads)); + + memset(st->nrperc, 0, st->max_amp * sizeof(*st->nrperc)); + memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2)); + + memset(st->nbases, 0, st->max_amp * sizeof(*st->nbases)); + memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2)); + + memset(st->coverage, 0, st->max_amp * st->max_amp_len + * sizeof(*st->coverage)); + memset(st->covered_perc, 0, st->max_amp * sizeof(*st->covered_perc)); + memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2)); + + // Keep the allocated entries as it's likely all files will share + // the same keys. Instead we reset counters to zero for common ones + // and delete rare ones. 
+ int i; + for (i = 0; i <= st->max_amp; i++) { + khiter_t k; + for (k = kh_begin(st->tcoord[i]); + k != kh_end(st->tcoord[i]); k++) + if (kh_exist(st->tcoord[i], k)) { + if (kh_value(st->tcoord[i], k) < 5) + kh_del(tcoord, st->tcoord[i], k); + else + kh_value(st->tcoord[i], k) = 0; + } + } + + khiter_t k; + for (k = kh_begin(st->qend); k != kh_end(st->qend); k++) + if (kh_exist(st->qend, k)) + free((void *)kh_key(st->qend, k)); + kh_clear(qname, st->qend); + + memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid)); + memset(st->depth_all, 0, st->max_len * sizeof(*st->depth_all)); + memset(st->amp_dist, 0, st->max_amp * sizeof(*st->amp_dist)); +} + +static void amp_stats_reset(amplicons_t *amps, int nref) { + int i; + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + stats_reset(amps[i].lstats); + } +} + +static int accumulate_stats(astats_args_t *args, amplicons_t *amps, + bam1_t *b) { + int ref = b->core.tid; + amplicon_t *amp = amps[ref].amp; + astats_t *stats = amps[ref].lstats; + int len = amps[ref].len; + + if (!stats) + return 0; + + stats->nseq++; + if ((b->core.flag & args->flag_require) != args->flag_require || + (b->core.flag & args->flag_filter) != 0) { + stats->nfiltered++; + return 0; + } + + int64_t start = b->core.pos, mstart = start; // modified start + int64_t end = bam_endpos(b), i; + + // Compute all-template-depth and valid-template-depth. + // We track current end location per read name so we can remove overlaps. + // Potentially we could use this data for a better amplicon-depth + // count too, but for now it's purely for the per-base plots. 
+ int ret; + khiter_t k; + int prev_start = 0, prev_end = 0; + if ((b->core.flag & BAM_FPAIRED) + && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) { + k = kh_put(qname, stats->qend, bam_get_qname(b), &ret); + if (ret == 0) { + prev_start = kh_value(stats->qend, k) & 0xffffffff; + prev_end = kh_value(stats->qend, k)>>32; + mstart = MAX(mstart, prev_end); + // Ideally we'd reuse strings so we don't thrash free/malloc. + // However let's see if the official way of doing that (malloc + // itself) is fast enough first. + free((void *)kh_key(stats->qend, k)); + kh_del(qname, stats->qend, k); + //fprintf(stderr, "remove overlap %d to %d\n", (int)start, (int)mstart); + } else { + if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b)))) + return -1; + + kh_value(stats->qend, k) = start | (end << 32); + } + } + for (i = mstart; i < end && i < len; i++) + stats->depth_all[i]++; + if (i < end) { + print_error("ampliconstats", "record %s overhangs end of reference", + bam_get_qname(b)); + // But keep going, as it's harmless. + } + + // On single ended runs, eg ONT or PacBio, we just use the start/end + // of the template to assign. + int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED) + ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1) + : (start >= 0 && start < len ? pos2start[start] : -1); + + // ivar sometimes soft-clips 100% of the bases. 
+ // This is essentially unmapped + if (end == start && (args->flag_filter & BAM_FUNMAP)) { + stats->nfiltered++; + return 0; + } + + if (anum == -1) + stats->nfailprimer++; + + if (anum >= 0) { + int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left); + if (c > 0) { + stats->nreads[anum]++; + // NB: ref bases rather than read bases + stats->nbases[anum] += c; + + int64_t i; + if (start < 0) start = 0; + if (end > len) end = len; + + int64_t ostart = MAX(start, amp[anum].min_left-1); + int64_t oend = MIN(end, amp[anum].max_right); + int64_t offset = amp[anum].min_left-1; + for (i = ostart; i < oend; i++) + stats->coverage[anum*stats->max_amp_len + i-offset]++; + } else { + stats->nfailprimer++; + } + } + + // Template length in terms of amplicon number to amplicon number. + // We expect left to right of same amplicon (len 0), but it may go + // to next amplicon (len 1) or prev (len -1), etc. + int64_t t_end; + int oth_anum = -1; + + if (b->core.flag & BAM_FPAIRED) { + t_end = (b->core.flag & BAM_FREVERSE ? end : start) + + b->core.isize; + + // If we've clipped the primers but not followed up with a fixmates + // then our start+TLEN will take us to a location which is + // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away. + // + // The correct solution is to run samtools fixmate so TLEN is correct. + // The hacky solution is to fudge the expected tlen by double the + // average primer length (e.g. 50). + t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj; + + if (t_end > 0 && t_end < len && b->core.isize != 0) + oth_anum = (b->core.flag & BAM_FREVERSE) + ? pos2start[t_end] + : pos2end[t_end]; + } else { + // Not paired (see int anum = (REV || !PAIR) ?en :st expr above) + oth_anum = pos2start[start]; + t_end = end; + } + + // We don't want to count our pairs twice. + // If both left/right are known, count it on left only. + // If only one is known, we'll only get to this code once + // so we can also count it. 
+ int astatus = 2; + if (anum != -1 && oth_anum != -1) { + astatus = oth_anum == anum ? 0 : 1; + if (start <= t_end) + stats->amp_dist[anum][astatus]++; + } else if (anum >= 0) { + stats->amp_dist[anum][astatus = 2]++; + } + + if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) { + if (prev_end && mstart > prev_end) { + // 2nd read with gap to 1st; undo previous increment. + for (i = prev_start; i < prev_end; i++) + stats->depth_valid[i]--; + stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1; + } else { + // 1st read, or 2nd read that overlaps 1st + for (i = mstart; i < end; i++) + stats->depth_valid[i]++; + stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1; + } + } + + // Track template start,end frequencies, so we can give stats on + // amplicon primer usage. + if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0) + // left to right only, so we don't double count template positions. + return 0; + + start = b->core.pos; + t_end = b->core.flag & BAM_FPAIRED + ? 
start + b->core.isize-1 + : end; + uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32); + k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret); + if (ret < 0) + return -1; + if (ret == 0) + kh_value(stats->tcoord[anum+1], k)++; + else + kh_value(stats->tcoord[anum+1], k)=1; + kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32); + + return 0; +} + +// Append file local stats to global stats +int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) { + gstats->nseq += lstats->nseq; + gstats->nfiltered += lstats->nfiltered; + gstats->nfailprimer += lstats->nfailprimer; + + int a; + for (a = -1; a < namp; a++) { + // Add khash local (kl) to khash global (kg) + khiter_t kl, kg; + for (kl = kh_begin(lstats->tcoord[a+1]); + kl != kh_end(lstats->tcoord[a+1]); kl++) { + if (!kh_exist(lstats->tcoord[a+1], kl) || + kh_value(lstats->tcoord[a+1], kl) == 0) + continue; + + int ret; + kg = kh_put(tcoord, gstats->tcoord[a+1], + kh_key(lstats->tcoord[a+1], kl), + &ret); + if (ret < 0) + return -1; + + kh_value(gstats->tcoord[a+1], kg) = + (ret == 0 + ? (kh_value(gstats->tcoord[a+1], kg) & 0xFFFFFFFF) + : 0) + + kh_value(lstats->tcoord[a+1], kl); + } + if (a == -1) continue; + + gstats->nreads[a] += lstats->nreads[a]; + gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a]; + gstats->nfull_reads[a] += lstats->nfull_reads[a]; + + // To get mean & sd for amplicon read percentage, we need + // to do the divisions here as nseq differs for each sample. + double nrperc = all_nseq ? 
100.0 * lstats->nreads[a] / all_nseq : 0; + gstats->nrperc[a] += nrperc; + gstats->nrperc2[a] += nrperc*nrperc; + + gstats->nbases[a] += lstats->nbases[a]; + gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a]; + + int d; + for (d = 0; d < MAX_DEPTH; d++) { + gstats->covered_perc[a][d] += lstats->covered_perc[a][d]; + gstats->covered_perc2[a][d] += lstats->covered_perc[a][d] + * lstats->covered_perc[a][d]; + } + + for (d = 0; d < 3; d++) + gstats->amp_dist[a][d] += lstats->amp_dist[a][d]; + } + + for (a = 0; a < lstats->max_len; a++) { + gstats->depth_valid[a] += lstats->depth_valid[a]; + gstats->depth_all[a] += lstats->depth_all[a]; + } + + return 0; +} + +int append_stats(amplicons_t *amps, int nref) { + int i, r, all_nseq = 0; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = amps[r].lstats; + all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer; + } + + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp, + all_nseq) < 0) + return -1; + } + + return 0; +} + +typedef struct { + int32_t start, end; + uint32_t freq; + uint32_t status; +} tcoord_t; + +// Sort tcoord by descending frequency and then ascending start and end. +static int tcoord_freq_sort(const void *vp1, const void *vp2) { + const tcoord_t *t1 = (const tcoord_t *)vp1; + const tcoord_t *t2 = (const tcoord_t *)vp2; + + if (t1->freq != t2->freq) + return t2->freq - t1->freq; + + if (t1->start != t2->start) + return t1->start - t2->start; + + return t1->end - t2->end; +} + + +/* + * Merges tcoord start,end,freq,status tuples if their coordinates are + * close together. We aim to keep the start,end for the most frequent + * value and assume that is the correct coordinate and all others are + * minor fluctuations due to errors or variants. + * + * We sort by frequency first and then merge later items in the list into + * the earlier more frequent ones. 
It's O(N^2), but sufficient for now + * given current scale of projects. + * + * If we ever need to resolve that then consider sorting by start + * coordinate and scanning the list to find all items within X, find + * the most frequent of those, and then cluster that way. (I'd have + * done that had I thought of it at the time!) + */ +static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){ + size_t n = *np, j, j2, j3, k; + + // Sort by frequency and cluster infrequent coords into frequent + // ones provided they're close by. + // This is O(N^2), but we've already binned by tcoord_bin/2 so + // the list isn't intended to be vast at this point. + qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort); + + // For frequency ties, find mid start coord, and then find mid end + // coord of those matching start. + // We make that the first item so we merge into that mid point. + for (j = 0; j < n; j++) { + for (j2 = j+1; j2 < n; j2++) { + if (tpos[j].freq != tpos[j2].freq) + break; + if (tpos[j2].start - tpos[j].start >= args->tcoord_bin) + break; + } + + // j to j2 all within bin of a common start, + // m is the mid start. + if (j2-1 > j) { + size_t m = (j2-1 + j)/2; + + // Find mid end for this same start + while (m > 1 && tpos[m].start == tpos[m-1].start) + m--; + for (j3 = m+1; j3 < j2; j3++) { + if (tpos[m].start != tpos[j3].start) + break; + if (tpos[m].end - tpos[j3].end >= args->tcoord_bin) + break; + } + if (j3-1 > m) + m = (j3-1 + m)/2; + + // Swap with first item. + tcoord_t tmp = tpos[j]; + tpos[j] = tpos[m]; + tpos[m] = tmp; + j = j2-1; + } + } + + // Now merge in coordinates. + // This bit is O(N^2), so consider binning first to reduce the + // size of the list if we have excessive positional variation. 
+ for (k = j = 0; j < n; j++) { + if (!tpos[j].freq) + continue; + + if (k < j) + tpos[k] = tpos[j]; + + for (j2 = j+1; j2 < n; j2++) { + if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 && + ABS(tpos[j].end -tpos[j2].end) < args->tcoord_bin/2 && + tpos[j].status == tpos[j2].status) { + tpos[k].freq += tpos[j2].freq; + tpos[j2].freq = 0; + } + } + k++; + } + + *np = k; +} + +int dump_stats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref, int local) { + int i, r; + FILE *ofp = args->out_fp; + tcoord_t *tpos = NULL; + size_t ntcoord = 0; + + // summary stats for this sample (or for all samples) + fprintf(ofp, "# Summary stats.\n"); + fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type); + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer; + char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2); + if (!name_ref) + return -1; + if (args->multi_ref) + sprintf(name_ref, "%s\t%s", name, amps[r].ref); + else + sprintf(name_ref, "%s", name); + fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n", + type, name_ref, stats->nseq); + fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n", + type, name_ref, stats->nfiltered); + fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n", + type, name_ref, stats->nfailprimer); + fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n", + type, name_ref, nmatch); + + int d = 0; + do { + // From first to last amplicon only, so not entire consensus. + // If contig length is known, maybe we want to add the missing + // count to < DEPTH figures? 
+ int64_t start = 0, covered = 0, total = 0; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int64_t j, offset = amp[i].min_left-1; + if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { + fprintf(stderr, "[ampliconstats] error: " + "Maximum amplicon length (%d) exceeded for '%s'\n", + stats->max_amp, name); + return -1; + } + for (j = MAX(start, amp[i].max_left-1); + j < MAX(start, amp[i].min_right); j++) { + if (stats->coverage[i*stats->max_amp_len + j-offset] + >= args->min_depth[d]) + covered++; + total++; + } + start = MAX(start, amp[i].min_right); + } + fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%" + PRId64"\t%"PRId64"\n", type, name_ref, + args->min_depth[d], args->min_depth[d], + total-covered, covered); + } while (++d < MAX_DEPTH && args->min_depth[d]); + + free(name_ref); + } + + // Read count + fprintf(ofp, "# Absolute matching read counts per amplicon.\n"); + fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cREADS\t%s", type, name); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%"PRId64, stats->nreads[i]); + } + } + fprintf(ofp, "\n"); + + // Valid depth is the number of full length reads (already divided + // by the number we expect to cover), so +0.5 per read in pair. + // A.k.a "usable depth" in the plots. + fprintf(ofp, "%cVDEPTH\t%s", type, name); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) + fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]); + } + fprintf(ofp, "\n"); + + if (type == 'C') { + // For combined we can compute mean & standard deviation too + fprintf(ofp, "CREADS\tMEAN"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CREADS\tSTDDEV"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + double n1 = stats->nreads[i]; + fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0 + ? sqrt(stats->nreads2[i]/(double)nfile + - (n1/nfile)*(n1/nfile)) + : 0); + } + } + fprintf(ofp, "\n"); + } + + fprintf(ofp, "# Read percentage of distribution between amplicons.\n"); + fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cRPERC\t%s", type, name); + int all_nseq = 0; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer; + } + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + if (type == 'C') { + fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile); + } else { + fprintf(ofp, "\t%.3f", + all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0); + } + } + } + fprintf(ofp, "\n"); + + if (type == 'C') { + // For combined we compute mean and standard deviation too + fprintf(ofp, "CRPERC\tMEAN"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CRPERC\tSTDDEV"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + // variance = SUM(X^2) - ((SUM(X)^2) / N) + double n1 = stats->nrperc[i]; + double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile); + fprintf(ofp, "\t%.3f", v>0?sqrt(v):0); + } + } + fprintf(ofp, "\n"); + } + + // Base depth + fprintf(ofp, "# Read depth per amplicon.\n"); + fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cDEPTH\t%s", type, name); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; + int64_t alen = amp[i].min_right - amp[i].max_left+1; + fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0); + } + } + fprintf(ofp, "\n"); + + if (type == 'C') { + // For combined we can compute mean & standard deviation too + fprintf(ofp, "CDEPTH\tMEAN"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; + for (i = 0; i < amps[r].namp; i++) { + int64_t alen = amp[i].min_right - amp[i].max_left+1; + fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CDEPTH\tSTDDEV"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + double alen = amp[i].min_right - amp[i].max_left+1; + double n1 = stats->nbases[i] / alen; + double v = stats->nbases2[i] / (alen*alen) /nfile + - (n1/nfile)*(n1/nfile); + fprintf(ofp, "\t%.1f", v>0?sqrt(v):0); + } + } + fprintf(ofp, "\n"); + } + + // Percent Coverage + if (type == 'F') { + fprintf(ofp, "# Percentage coverage per amplicon\n"); + fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type); + int d = 0; + do { + fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name); + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int covered = 0; + if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { + fprintf(stderr, "[ampliconstats] error: " + "Maximum amplicon length (%d) exceeded for '%s'\n", + stats->max_amp, name); + return -1; + } + int64_t j, offset = amp[i].min_left-1; + for (j = amp[i].max_left-1; j < amp[i].min_right; j++) { + int apos = i*stats->max_amp_len + j-offset; + if (stats->coverage[apos] >= args->min_depth[d]) + covered++; + } + int64_t alen = amp[i].min_right - amp[i].max_left+1; + stats->covered_perc[i][d] = 100.0 * covered / alen; + fprintf(ofp, "\t%.2f", 100.0 * covered / alen); + } + } + fprintf(ofp, "\n"); + } while (++d < MAX_DEPTH && args->min_depth[d]); + + } else if (type == 'C') { + // For combined we can compute mean & standard deviation too + int d = 0; + do { + fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + double n1 = stats->covered_perc[i][d] / nfile; + double v = stats->covered_perc2[i][d] / nfile - n1*n1; + fprintf(ofp, "\t%.1f", v>0?sqrt(v):0); + } + } + fprintf(ofp, "\n"); + } while (++d < MAX_DEPTH && args->min_depth[d]); + } + + // Plus base depth for all reads, irrespective of amplicon. + // This is post overlap removal, if reads in the read-pair overlap. + fprintf(ofp, "# Depth per reference base for ALL data.\n"); + fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n", + type); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + if (args->multi_ref) + fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref); + else + fprintf(ofp, "%cDP_ALL\t%s", type, name); + + for (i = 0; i < amps[r].len; i++) { + // Basic run-length encoding provided all values are within + // +- depth_bin fraction of the mid-point. 
+ int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j; + double dmid = (dmin + dmax)/2.0; + double low = dmid*(1-args->depth_bin); + double high = dmid*(1+args->depth_bin); + for (j = i+1; j < amps[r].len; j++) { + int d = stats->depth_all[j]; + if (d < low || d > high) + break; + if (dmin > d) { + dmin = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } else if (dmax < d) { + dmax = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } + } + fprintf(ofp, "\t%d,%d", (int)dmid, j-i); + i = j-1; + } + fprintf(ofp, "\n"); + } + + // And depth for only reads matching to a single amplicon for full + // length. This is post read overlap removal. + fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n"); + fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this " + "part.\n", type); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + if (args->multi_ref) + fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref); + else + fprintf(ofp, "%cDP_VALID\t%s", type, name); + + for (i = 0; i < amps[r].len; i++) { + int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j; + double dmid = (dmin + dmax)/2.0; + double low = dmid*(1-args->depth_bin); + double high = dmid*(1+args->depth_bin); + for (j = i+1; j < amps[r].len; j++) { + int d = stats->depth_valid[j]; + if (d < low || d > high) + break; + if (dmin > d) { + dmin = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } else if (dmax < d) { + dmax = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } + } + fprintf(ofp, "\t%d,%d", (int)dmid, j-i); + i = j-1; + } + fprintf(ofp, "\n"); + } + + // TCOORD (start to end) distribution + fprintf(ofp, "# Distribution of aligned template coordinates.\n"); + fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0 - (nref==1); i < amps[r].namp; i++) { + if (ntcoord < kh_size(stats->tcoord[i+1])) { + ntcoord = kh_size(stats->tcoord[i+1]); + tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp)); + if (!tmp) { + free(tpos); + return -1; + } + tpos = tmp; + } + + khiter_t k; + size_t n = 0, j; + for (k = kh_begin(stats->tcoord[i+1]); + k != kh_end(stats->tcoord[i+1]); k++) { + if (!kh_exist(stats->tcoord[i+1], k) || + (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0) + continue; + // Key is start,end in 32-bit quantities. + // Yes this limits us to 4Gb references, but just how + // many primers are we planning on making? Not that many + // I hope. 
+ tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff; + tpos[n].end = kh_key(stats->tcoord[i+1], k)>>32; + + // Value is frequency (top 32-bits) and status (bottom 32). + tpos[n].freq = kh_value(stats->tcoord[i+1], k)&0xffffffff; + tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32; + n++; + } + + if (args->tcoord_bin > 1) + aggregate_tcoord(args, tpos, &n); + + fprintf(ofp, "%cTCOORD\t%s\t%d", type, name, + i+1+amps[r].first_amp); // per amplicon + for (j = 0; j < n; j++) { + if (tpos[j].freq < args->tcoord_min_count) + continue; + fprintf(ofp, "\t%d,%d,%u,%u", + tpos[j].start, + tpos[j].end, + tpos[j].freq, + tpos[j].status); + } + fprintf(ofp, "\n"); + } + } + + + // AMP length distribution. + // 0 = both ends in this amplicon + // 1 = ends in different amplicons + // 2 = other end matching an unknown amplicon site + // (see tcoord for further analysis of where) + fprintf(ofp, "# Classification of amplicon status. Columns are\n"); + fprintf(ofp, "# number with both primers from this amplicon, number with\n"); + fprintf(ofp, "# primers from different amplicon, and number with a position\n"); + fprintf(ofp, "# not matching any valid amplicon primer site\n"); + fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type); + + fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged + int amp_dist[3] = {0}; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps + amp_dist[0] += stats->amp_dist[i][0]; + amp_dist[1] += stats->amp_dist[i][1]; + amp_dist[2] += stats->amp_dist[i][2]; + } + } + fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]); + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + // per amplicon + fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp); + fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0], + stats->amp_dist[i][1], stats->amp_dist[i][2]); + } + } + + free(tpos); + return 0; +} + +int dump_lstats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref) { + return dump_stats(args, type, name, nfile, amps, nref, 1); +} + +int dump_gstats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref) { + return dump_stats(args, type, name, nfile, amps, nref, 0); +} + +char const *get_sample_name(sam_hdr_t *header, char *RG) { + kstring_t ks = {0}; + sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks); + return ks.s; +} + +// Return maximum reference length (SQ is NULL) or the length +// of the specified reference in SQ. +int64_t get_ref_len(sam_hdr_t *header, const char *SQ) { + if (SQ) { + int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0; + return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1; + } else { + int nref = sam_hdr_nref(header), tid;; + int64_t len = 0; + for (tid = 0; tid < nref; tid++) { + int64_t rl = sam_hdr_tid2len(header, tid); + if (len < rl) + len = rl; + } + return len; + } +} + +static int amplicon_stats(astats_args_t *args, + khash_t(bed_list_hash) *bed_hash, + char **filev, int filec) { + int i, ref = -1, ref_tid = -1, ret = -1, nref = 0; + samFile *fp = NULL; + sam_hdr_t *header = NULL; + bam1_t *b = bam_init1(); + FILE *ofp = args->out_fp; + char sname_[8192], *sname = NULL; + amplicons_t *amps = NULL; + + // Report initial SS header. We gather data from the bed_hash entries + // as well as from the first SAM header (with the requirement that all + // headers should be compatible). 
+ if (filec) { + if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) { + print_error_errno("ampliconstats", + "Cannot open input file \"%s\"", + filev[0]); + goto err; + } + if (!(header = sam_hdr_read(fp))) + goto err; + + if (!amps) { + amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps)); + if (!amps) + goto err; + fprintf(ofp, "# Summary statistics, used for scaling the plots.\n"); + fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version()); + fprintf(ofp, "SS\tCommand line: %s\n", args->argv); + fprintf(ofp, "SS\tNumber of files:\t%d\n", filec); + + // Note: order of hash entries will be different to order of + // BED file which may also differ to order of SQ headers. + // SQ header is canonical ordering (pos sorted file). + khiter_t k; + int bam_nref = sam_hdr_nref(header); + for (i = 0; i < bam_nref; i++) { + k = kh_get(bed_list_hash, bed_hash, + sam_hdr_tid2name(header, i)); + if (!kh_exist(bed_hash, k)) + continue; + + bed_entry_list_t *sites = &kh_value(bed_hash, k); + + ref = i; + amps[ref].ref = kh_key(bed_hash, k); + amps[ref].sites = sites; + amps[ref].namp = count_amplicon(sites); + amps[ref].amp = calloc(sites->length, + sizeof(*amps[ref].amp)); + if (!amps[ref].amp) + goto err; + if (args->multi_ref) + fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n", + kh_key(bed_hash, k), amps[ref].namp); + else + fprintf(ofp, "SS\tNumber of amplicons:\t%d\n", + amps[ref].namp); + + amps[ref].tid = ref; + if (ref_tid == -1) + ref_tid = ref; + + int64_t len = get_ref_len(header, kh_key(bed_hash, k)); + amps[ref].len = len; + if (args->multi_ref) + fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n", + kh_key(bed_hash, k), len); + else + fprintf(ofp, "SS\tReference length:\t%"PRId64"\n", + len); + + amps[ref].lstats = stats_alloc(len, args->max_amp, + args->max_amp_len); + amps[ref].gstats = stats_alloc(len, args->max_amp, + args->max_amp_len); + if (!amps[ref].lstats || !amps[ref].gstats) + goto err; + } + } + + sam_hdr_destroy(header); + header = 
NULL; + if (sam_close(fp) < 0) { + fp = NULL; + goto err; + } + fp = NULL; + } + fprintf(ofp, "SS\tEnd of summary\n"); + + // Extract the bits of amplicon data we need from bed hash and turn + // it into a position-to-amplicon lookup table. + int offset = 0; + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + + amps[i].first_amp = offset; + if (bed2amplicon(args, amps[i].sites, amps[i].amp, + &s[i].namp, i==0, amps[i].ref, offset) < 0) + goto err; + + offset += amps[i].namp; // cumulative amplicon number across refs + } + + // Now iterate over file contents, one at a time. + for (i = 0; i < filec; i++) { + char *nstart = filev[i]; + + fp = sam_open_format(filev[i], "r", &args->ga.in); + if (!fp) { + print_error_errno("ampliconstats", + "Cannot open input file \"%s\"", + filev[i]); + goto err; + } + + if (args->ga.nthreads > 0) + hts_set_threads(fp, args->ga.nthreads); + + if (!(header = sam_hdr_read(fp))) + goto err; + + if (nref != sam_hdr_nref(header)) { + print_error_errno("ampliconstats", + "SAM headers are not consistent across input files"); + goto err; + } + int r; + for (r = 0; r < nref; r++) { + if (!amps[r].ref || + strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 || + amps[r].len != sam_hdr_tid2len(header, r)) { + print_error_errno("ampliconstats", + "SAM headers are not consistent across " + "input files"); + goto err; + } + } + + if (args->use_sample_name) + sname = (char *)get_sample_name(header, NULL); + + if (!sname) { + sname = sname_; + char *nend = filev[i] + strlen(filev[i]), *cp; + if ((cp = strrchr(filev[i], '/'))) + nstart = cp+1; + if ((cp = strrchr(nstart, '.')) && + (strcmp(cp, ".bam") == 0 || + strcmp(cp, ".sam") == 0 || + strcmp(cp, ".cram") == 0)) + nend = cp; + if (nend - nstart >= 8192) nend = nstart+8191; + memcpy(sname, nstart, nend-nstart); + sname[nend-nstart] = 0; + } + + // Stats local to this sample only + amp_stats_reset(amps, nref); + + int last_ref = -9; + while ((r = sam_read1(fp, header, b)) >= 0) { + 
// Other filter options useful here? + if (b->core.tid < 0) + continue; + + if (last_ref != b->core.tid) { + last_ref = b->core.tid; + if (initialise_amp_pos_lookup(args, amps, last_ref) < 0) + goto err; + } + + if (accumulate_stats(args, amps, b) < 0) + goto err; + } + + if (r < -1) { + print_error_errno("ampliconstats", "Fail reading record"); + goto err; + } + + sam_hdr_destroy(header); + if (sam_close(fp) < 0) { + fp = NULL; + goto err; + } + + fp = NULL; + header = NULL; + + if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0) + goto err; + + if (append_stats(amps, nref) < 0) + goto err; + + if (sname && sname != sname_) + free(sname); + sname = NULL; + } + + if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0) + goto err; + + ret = 0; + err: + bam_destroy1(b); + if (ret) { + if (header) + sam_hdr_destroy(header); + if (fp) + sam_close(fp); + } + for (i = 0; i < nref; i++) { + stats_free(amps[i].lstats); + stats_free(amps[i].gstats); + free(amps[i].amp); + } + free(amps); + free(pos2start); + free(pos2end); + if (ret) { + if (sname && sname != sname_) + free(sname); + } + + return ret; +} + +static int usage(astats_args_t *args, FILE *fp, int exit_status) { + fprintf(fp, +"\n" +"Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n" +"\n" +"Options:\n"); + fprintf(fp, " -f, --required-flag STR|INT\n" + " Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require); + fprintf(fp, " -F, --filter-flag STR|INT\n" + " Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff); + fprintf(fp, " -a, --max-amplicons INT\n" + " Change the maximum number of amplicons permitted [%d]\n", MAX_AMP); + fprintf(fp, " -l, --max-amplicon-length INT\n" + " Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN); + fprintf(fp, " -d, --min-depth INT[,INT]...\n" + " Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]); + fprintf(fp, " -m, --pos-margin INT\n" 
+ " Margin of error for matching primer positions [%d]\n", args->max_delta); + fprintf(fp, " -o, --output FILE\n" + " Specify output file [stdout if unset]\n"); + fprintf(fp, " -s, --use-sample-name\n" + " Use the sample name from the first @RG header line\n"); + fprintf(fp, " -t, --tlen-adjust INT\n" + " Add/subtract from TLEN; use when clipping but no fixmate step\n"); + fprintf(fp, " -b, --tcoord-bin INT\n" + " Bin template start,end positions into multiples of INT[1]\n"); + fprintf(fp, " -c, --tcoord-min-count INT\n" + " Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT); + fprintf(fp, " -D, --depth-bin FRACTION\n" + " Merge FDP values within +/- FRACTION together\n"); + fprintf(fp, " -S, --single-ref\n" + " Force single-ref (<=1.12) output format\n"); + sam_global_opt_help(fp, "I.--.@"); + + return exit_status; +} + +int main_ampliconstats(int argc, char **argv) { + astats_args_t args = { + .ga = SAM_GLOBAL_ARGS_INIT, + .flag_require = 0, + .flag_filter = 0x10B04, + //.sites = BED_LIST_INIT, + .max_delta = 30, // large enough to cope with alt primers + .min_depth = {1}, + .use_sample_name = 0, + .max_amp = MAX_AMP, + .max_amp_len = MAX_AMP_LEN, + .tlen_adj = 0, + .out_fp = stdout, + .tcoord_min_count = TCOORD_MIN_COUNT, + .tcoord_bin = 1, + .depth_bin = 0.01, + .multi_ref = 1 + }, oargs = args; + + static const struct option loptions[] = + { + SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'), + {"help", no_argument, NULL, 'h'}, + {"flag-require", required_argument, NULL, 'f'}, + {"flag-filter", required_argument, NULL, 'F'}, + {"min-depth", required_argument, NULL, 'd'}, + {"output", required_argument, NULL, 'o'}, + {"pos-margin", required_argument, NULL, 'm'}, + {"use-sample-name", no_argument, NULL, 's'}, + {"max-amplicons", required_argument, NULL, 'a'}, + {"max-amplicon-length", required_argument, NULL, 'l'}, + {"tlen-adjust", required_argument, NULL, 't'}, + {"tcoord-min-count", required_argument, NULL, 'c'}, + {"tcoord-bin", 
required_argument, NULL, 'b'}, + {"depth-bin", required_argument, NULL, 'D'}, + {"single-ref", no_argument, NULL, 'S'}, + {NULL, 0, NULL, 0} + }; + int opt; + + while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) { + switch (opt) { + case 'f': args.flag_require = bam_str2flag(optarg); break; + case 'F': + if (args.flag_filter & 0x10000) + args.flag_filter = 0; // strip default on first -F usage + args.flag_filter |= bam_str2flag(optarg); break; + + case 'm': args.max_delta = atoi(optarg); break; // margin + case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction + case 'd': { + int d = 0; + char *cp = optarg, *ep; + do { + long n = strtol(cp, &ep, 10); + args.min_depth[d++] = n; + if (*ep != ',') + break; + cp = ep+1; + } while (d < MAX_DEPTH); + break; + } + + case 'a': args.max_amp = atoi(optarg)+1;break; + case 'l': args.max_amp_len = atoi(optarg)+1;break; + + case 'c': args.tcoord_min_count = atoi(optarg);break; + case 'b': + args.tcoord_bin = atoi(optarg); + if (args.tcoord_bin < 1) + args.tcoord_bin = 1; + break; + + case 't': args.tlen_adj = atoi(optarg);break; + + case 's': args.use_sample_name = 1;break; + + case 'o': + if (!(args.out_fp = fopen(optarg, "w"))) { + perror(optarg); + return 1; + } + break; + + case 'S': + args.multi_ref = 0; + break; + + case '?': return usage(&oargs, stderr, EXIT_FAILURE); + case 'h': return usage(&oargs, stdout, EXIT_SUCCESS); + + default: + if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0) + usage(&oargs,stderr, EXIT_FAILURE); + break; + } + } + + if (argc <= optind) + return usage(&oargs, stdout, EXIT_SUCCESS); + if (argc <= optind+1 && isatty(STDIN_FILENO)) + return usage(&oargs, stderr, EXIT_FAILURE); + + khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); + if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) { + print_error_errno("ampliconstats", + "Could not read file \"%s\"", argv[optind]); + return 1; + + } + + khiter_t k, ref_count = 0; 
+ for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) { + if (!kh_exist(bed_hash, k)) + continue; + ref_count++; + } + if (ref_count == 0) + return 1; + if (ref_count > 1 && args.multi_ref == 0) { + print_error("ampliconstats", + "Single-ref mode is not permitted for BED files\n" + "containing more than one reference."); + return 1; + } + + args.argv = stringify_argv(argc, argv); + int ret; + if (argc == ++optind) { + char *av = "-"; + ret = amplicon_stats(&args, bed_hash, &av, 1); + } else { + ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind); + } + + free(args.argv); + destroy_bed_hash(bed_hash); + + return ret; +} diff --git a/samtools/amplicon_stats.c.pysam.c b/samtools/amplicon_stats.c.pysam.c new file mode 100644 index 0000000..aa09459 --- /dev/null +++ b/samtools/amplicon_stats.c.pysam.c @@ -0,0 +1,1756 @@ +#include "samtools.pysam.h" + +/* stats.c -- This is the former bamcheck integrated into samtools/htslib. + + Copyright (C) 2020-2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + * This tool is designed to give "samtools stats" style output, but dedicated + * to small amplicon sequencing projects. It gathers stats on the + * distribution of reads across amplicons. + */ + +/* + * TODO: + * - Cope with multiple references. What do we do here? Just request one? + * - Permit regions rather than consuming whole file (maybe solves above). + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "samtools.h" +#include "sam_opts.h" +#include "bam_ampliconclip.h" + +KHASH_MAP_INIT_INT64(tcoord, int64_t) +KHASH_MAP_INIT_STR(qname, int64_t) + +#ifndef MIN +#define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef MAX +#define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +#ifndef ABS +#define ABS(a) ((a)>=0?(a):-(a)) +#endif + +#define TCOORD_MIN_COUNT 10 +#define MAX_AMP 1000 // Default maximum number of amplicons +#define MAX_AMP_LEN 1000 // Default maximum length of any single amplicon +#define MAX_PRIMER_PER_AMPLICON 4 // Max primers per LEFT/RIGHT +#define MAX_DEPTH 5 // Number of different depths permitted + +typedef struct { + sam_global_args ga; + uint32_t flag_require; + uint32_t flag_filter; + int max_delta; // Used for matching read to amplicon primer loc + int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep + int use_sample_name; + int max_amp; // Total number of amplicons + int max_amp_len; // Maximum length of an individual amplicon + double depth_bin;// aggregate depth within this fraction + int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate + FILE *out_fp; + char *argv; + int tcoord_min_count; + int tcoord_bin; + int multi_ref; +} astats_args_t; + +typedef struct { + int 
nseq; // total sequence count + int nfiltered; // sequence filtered + int nfailprimer;// count of sequences not matching the primer locations + + // Sizes of memory allocated below, to permit reset + int max_amp, max_amp_len, max_len; + + // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc + int64_t *nreads, *nreads2; // [max_amp] + double *nfull_reads; // [max_amp]; 0.5/read if paired. + double *nrperc, *nrperc2; // [max_amp] + int64_t *nbases, *nbases2; // [max_amp] + int64_t *coverage; // [max_amp][max_amp_len] + double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH] + double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH]; + khash_t(tcoord) **tcoord; // [max_amp+1] + + // 0 is correct pair, 1 is incorrect pair, 2 is unidentified + int (*amp_dist)[3]; // [MAX_AMP][3]; + + int *depth_valid; // [max_len] + int *depth_all; // [max_len] + khash_t(qname) *qend; // queryname end, for overlap removal +} astats_t; + +// We can have multiple primers for LEFT / RIGHT, so this +// permits detection by any compatible combination. +// One reference: +typedef struct { + int64_t left[MAX_PRIMER_PER_AMPLICON]; + int nleft; + int64_t right[MAX_PRIMER_PER_AMPLICON]; + int nright; + int64_t max_left, min_right; // inner dimensions + int64_t min_left, max_right; // outer dimensions +} amplicon_t; + +// Multiple references, we have an array of amplicons_t - one per used ref. +// We have per reference local and global stats here, as some of the stats +// are coordinate based. However we report them combined together as a single +// list across all references. +// "namp" is the number of amplicons in this reference, but they're +// numbered first_amp to first_amp+namp-1 inclusively. 
+typedef struct {
+    int tid, namp;
+    int64_t len;
+    bed_entry_list_t *sites;
+    amplicon_t *amp;
+    astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
+    const char *ref;           // ref name (pointer to the bed hash table key)
+    int first_amp;             // first amplicon number for this ref
+} amplicons_t;
+
+// Position-to-amplicon lookup tables, rebuilt by
+// initialise_amp_pos_lookup() for each new reference/chromosome.
+// Entry [p-1] holds the zero-based index (0 to namp-1) of an amplicon
+// having a primer within max_delta of position p (1 to len), or -1 when
+// no amplicon primer matches that position.
+static int *pos2start = NULL;
+static int *pos2end = NULL;
+// NOTE(review): pos2size is an int while reference lengths are int64_t;
+// confirm whether references longer than INT_MAX need to be supported.
+static int pos2size = 0; // allocated size of pos2start/end
+
+// Stores amplicon index "idx" into "lookup" for every position within
+// "delta" of primer coordinate "pos".  Positions outside 1..max_len are
+// skipped so the write to lookup[p-1] always stays inside the table.
+static void mark_primer_window(int *lookup, int64_t pos, int delta,
+                               int64_t max_len, int idx) {
+    int64_t p;
+    for (p = pos - delta; p <= pos + delta; p++) {
+        if (p < 1 || p > max_len)
+            continue;
+        lookup[p-1] = idx;
+    }
+}
+
+// Lookup table to go from position to amplicon based on
+// read start / end.
+//
+// Rebuilds pos2start / pos2end for reference number "ref" of "amps":
+// every entry is first reset to -1, then each LEFT primer marks
+// pos2start and each RIGHT primer marks pos2end with its amplicon index,
+// widened by +/- args->max_delta.  Where windows of several amplicons
+// overlap, the highest-numbered amplicon wins as it is written last.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure.
+static int initialise_amp_pos_lookup(astats_args_t *args,
+                                     amplicons_t *amps,
+                                     int ref) {
+    int64_t i, j;
+    amplicon_t *amp = amps[ref].amp;
+    int64_t max_len = amps[ref].len;
+    int namp = amps[ref].namp;
+
+    if (max_len+1 > pos2size) {
+        // Grow through a temporary: realloc leaves the old block intact
+        // on failure, so assigning its result straight to pos2start /
+        // pos2end would lose (and leak) the existing table.
+        int *tmp = realloc(pos2start, (max_len+1)*sizeof(*pos2start));
+        if (!tmp)
+            return -1;
+        pos2start = tmp;
+
+        tmp = realloc(pos2end, (max_len+1)*sizeof(*pos2end));
+        if (!tmp)
+            return -1;
+        pos2end = tmp;
+
+        // Record the element count actually allocated, so that a later
+        // reference of the same length does not trigger another realloc.
+        pos2size = max_len+1;
+    }
+    for (i = 0; i < max_len; i++)
+        pos2start[i] = pos2end[i] = -1;
+
+    for (i = 0; i < namp; i++) {
+        for (j = 0; j < amp[i].nleft; j++)
+            mark_primer_window(pos2start, amp[i].left[j],
+                               args->max_delta, max_len, i);
+        for (j = 0; j < amp[i].nright; j++)
+            mark_primer_window(pos2end, amp[i].right[j],
+                               args->max_delta, max_len, i);
+    }
+
+    return 0;
+}
+
+// Counts amplicons.
+// Assumption: input BED file alternates between LEFT and RIGHT primers
+// per amplicon, thus we can count the number based on the switching
+// orientation.
+static int count_amplicon(bed_entry_list_t *sites) { + int i, namp, last_rev = 0; + for (i = namp = 0; i < sites->length; i++) { + if (sites->bp[i].rev == 0 && last_rev) + namp++; + last_rev = sites->bp[i].rev; + } + + return ++namp; +} + +// We're only interest in the internal part of the amplicon. +// Our bed file has LEFT start/end followed by RIGHT start/end, +// so collapse these to LEFT end / RIGHT start. +// +// Returns right most amplicon position on success, +// < 0 on error +static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites, + amplicon_t *amp, int *namp, int do_title, + const char *ref, int first_amp) { + int i, j; + int64_t max_right = 0; + FILE *ofp = args->out_fp; + + *namp = 0; + + // Assume all primers for the same amplicon are adjacent in BED + // with all + followed by all -. Thus - to + signifies next primer set. + int last_rev = 0; + amp[0].max_left = 0; + amp[0].min_right = INT64_MAX; + amp[0].min_left = INT64_MAX; + amp[0].max_right = 0; + if (do_title) { + fprintf(ofp, "# Amplicon locations from BED file.\n"); + fprintf(ofp, "# LEFT/RIGHT are - format and " + "comma-separated for alt-primers.\n"); + if (args->multi_ref) + fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n"); + else + fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n"); + } + for (i = j = 0; i < sites->length; i++) { + if (i == 0 && sites->bp[i].rev != 0) { + fprintf(samtools_stderr, "[ampliconstats] error: BED file should start" + " with the + strand primer\n"); + return -1; + } + if (sites->bp[i].rev == 0 && last_rev) { + j++; + if (j >= args->max_amp) { + fprintf(samtools_stderr, "[ampliconstats] error: too many amplicons" + " (%d). 
Use -a option to raise this.\n", j); + return -1; + } + amp[j].max_left = 0; + amp[j].min_right = INT64_MAX; + amp[j].min_left = INT64_MAX; + amp[j].max_right = 0; + } + if (sites->bp[i].rev == 0) { + if (i == 0 || last_rev) { + if (j>0) fprintf(ofp, "\n"); + if (args->multi_ref) + fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp); + else + fprintf(ofp, "AMPLICON\t%d", j+1); + } + if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) { + print_error_errno("ampliconstats", + "too many primers per amplicon (%d).\n", + MAX_PRIMER_PER_AMPLICON); + return -1; + } + amp[j].left[amp[j].nleft++] = sites->bp[i].right; + if (amp[j].max_left < sites->bp[i].right+1) + amp[j].max_left = sites->bp[i].right+1; + if (amp[j].min_left > sites->bp[i].right+1) + amp[j].min_left = sites->bp[i].right+1; + // BED file, so left+1 as zero based. right(+1-1) as + // BED goes one beyond end (and we want inclusive range). + fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1], + sites->bp[i].left+1, sites->bp[i].right); + } else { + if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) { + print_error_errno("ampliconstats", + "too many primers per amplicon (%d)", + MAX_PRIMER_PER_AMPLICON); + return -1; + } + amp[j].right[amp[j].nright++] = sites->bp[i].left; + if (amp[j].min_right > sites->bp[i].left-1) + amp[j].min_right = sites->bp[i].left-1; + if (amp[j].max_right < sites->bp[i].left-1) { + amp[j].max_right = sites->bp[i].left-1; + if (amp[j].max_right - amp[j].min_left + 1 >= + args->max_amp_len) { + fprintf(samtools_stderr, "[ampliconstats] error: amplicon " + "longer (%d) than max_amp_len option (%d)\n", + (int)(amp[j].max_right - amp[j].min_left + 2), + args->max_amp_len); + return -1; + } + if (max_right < amp[j].max_right) + max_right = amp[j].max_right; + } + fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1], + sites->bp[i].left+1, sites->bp[i].right); + } + last_rev = sites->bp[i].rev; + } + if (last_rev != 1) { + fprintf(ofp, "\n"); // useful if going to samtools_stdout + 
fprintf(samtools_stderr, "[ampliconstats] error: bed file does not end on" + " a reverse strand primer.\n"); + return -1; + } + *namp = ++j; + if (j) fprintf(ofp, "\n"); + + if (j >= args->max_amp) { + fprintf(samtools_stderr, "[ampliconstats] error: " + "too many amplicons (%d). Use -a option to raise this.", j); + return -1; + } + +// for (i = 0; i < *namp; i++) { +// fprintf(samtools_stdout, "%d\t%ld", i, amp[i].length); +// for (j = 0; j < amp[i].nleft; j++) +// fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].left[j]); +// for (j = 0; j < amp[i].nright; j++) +// fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].right[j]); +// fprintf(samtools_stdout, "\n"); +// } + + return max_right; +} + +void stats_free(astats_t *st) { + if (!st) + return; + + free(st->nreads); + free(st->nreads2); + free(st->nfull_reads); + free(st->nrperc); + free(st->nrperc2); + free(st->nbases); + free(st->nbases2); + free(st->coverage); + free(st->covered_perc); + free(st->covered_perc2); + free(st->amp_dist); + + free(st->depth_valid); + free(st->depth_all); + + if (st->tcoord) { + int i; + for (i = 0; i <= st->max_amp; i++) { + if (st->tcoord[i]) + kh_destroy(tcoord, st->tcoord[i]); + } + free(st->tcoord); + } + + khiter_t k; + for (k = kh_begin(st->qend); k != kh_end(st->qend); k++) + if (kh_exist(st->qend, k)) + free((void *)kh_key(st->qend, k)); + kh_destroy(qname, st->qend); + + free(st); +} + +astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) { + astats_t *st = calloc(1, sizeof(*st)); + if (!st) + return NULL; + + st->max_amp = max_amp; + st->max_amp_len = max_amp_len; + st->max_len = max_len; + + if (!(st->nreads = calloc(max_amp, sizeof(*st->nreads)))) goto err; + if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err; + if (!(st->nrperc = calloc(max_amp, sizeof(*st->nrperc)))) goto err; + if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err; + if (!(st->nbases = calloc(max_amp, sizeof(*st->nbases)))) goto err; + if 
(!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err; + + if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads)))) + goto err; + + if (!(st->coverage = calloc(max_amp*max_amp_len, sizeof(*st->coverage)))) + goto err; + + if (!(st->covered_perc = calloc(max_amp, sizeof(*st->covered_perc)))) + goto err; + if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2)))) + goto err; + + if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err; + int i; + for (i = 0; i <= st->max_amp; i++) + if (!(st->tcoord[i] = kh_init(tcoord))) + goto err; + + if (!(st->qend = kh_init(qname))) + goto err; + + if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid)))) + goto err; + if (!(st->depth_all = calloc(max_len, sizeof(*st->depth_all)))) + goto err; + + if (!(st->amp_dist = calloc(max_amp, sizeof(*st->amp_dist)))) goto err; + + return st; + + err: + stats_free(st); + return NULL; +} + +static void stats_reset(astats_t *st) { + st->nseq = 0; + st->nfiltered = 0; + st->nfailprimer = 0; + + memset(st->nreads, 0, st->max_amp * sizeof(*st->nreads)); + memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2)); + memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads)); + + memset(st->nrperc, 0, st->max_amp * sizeof(*st->nrperc)); + memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2)); + + memset(st->nbases, 0, st->max_amp * sizeof(*st->nbases)); + memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2)); + + memset(st->coverage, 0, st->max_amp * st->max_amp_len + * sizeof(*st->coverage)); + memset(st->covered_perc, 0, st->max_amp * sizeof(*st->covered_perc)); + memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2)); + + // Keep the allocated entries as it's likely all files will share + // the same keys. Instead we reset counters to zero for common ones + // and delete rare ones. 
+ int i; + for (i = 0; i <= st->max_amp; i++) { + khiter_t k; + for (k = kh_begin(st->tcoord[i]); + k != kh_end(st->tcoord[i]); k++) + if (kh_exist(st->tcoord[i], k)) { + if (kh_value(st->tcoord[i], k) < 5) + kh_del(tcoord, st->tcoord[i], k); + else + kh_value(st->tcoord[i], k) = 0; + } + } + + khiter_t k; + for (k = kh_begin(st->qend); k != kh_end(st->qend); k++) + if (kh_exist(st->qend, k)) + free((void *)kh_key(st->qend, k)); + kh_clear(qname, st->qend); + + memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid)); + memset(st->depth_all, 0, st->max_len * sizeof(*st->depth_all)); + memset(st->amp_dist, 0, st->max_amp * sizeof(*st->amp_dist)); +} + +static void amp_stats_reset(amplicons_t *amps, int nref) { + int i; + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + stats_reset(amps[i].lstats); + } +} + +static int accumulate_stats(astats_args_t *args, amplicons_t *amps, + bam1_t *b) { + int ref = b->core.tid; + amplicon_t *amp = amps[ref].amp; + astats_t *stats = amps[ref].lstats; + int len = amps[ref].len; + + if (!stats) + return 0; + + stats->nseq++; + if ((b->core.flag & args->flag_require) != args->flag_require || + (b->core.flag & args->flag_filter) != 0) { + stats->nfiltered++; + return 0; + } + + int64_t start = b->core.pos, mstart = start; // modified start + int64_t end = bam_endpos(b), i; + + // Compute all-template-depth and valid-template-depth. + // We track current end location per read name so we can remove overlaps. + // Potentially we could use this data for a better amplicon-depth + // count too, but for now it's purely for the per-base plots. 
+ int ret; + khiter_t k; + int prev_start = 0, prev_end = 0; + if ((b->core.flag & BAM_FPAIRED) + && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) { + k = kh_put(qname, stats->qend, bam_get_qname(b), &ret); + if (ret == 0) { + prev_start = kh_value(stats->qend, k) & 0xffffffff; + prev_end = kh_value(stats->qend, k)>>32; + mstart = MAX(mstart, prev_end); + // Ideally we'd reuse strings so we don't thrash free/malloc. + // However let's see if the official way of doing that (malloc + // itself) is fast enough first. + free((void *)kh_key(stats->qend, k)); + kh_del(qname, stats->qend, k); + //fprintf(samtools_stderr, "remove overlap %d to %d\n", (int)start, (int)mstart); + } else { + if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b)))) + return -1; + + kh_value(stats->qend, k) = start | (end << 32); + } + } + for (i = mstart; i < end && i < len; i++) + stats->depth_all[i]++; + if (i < end) { + print_error("ampliconstats", "record %s overhangs end of reference", + bam_get_qname(b)); + // But keep going, as it's harmless. + } + + // On single ended runs, eg ONT or PacBio, we just use the start/end + // of the template to assign. + int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED) + ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1) + : (start >= 0 && start < len ? pos2start[start] : -1); + + // ivar sometimes soft-clips 100% of the bases. 
+ // This is essentially unmapped + if (end == start && (args->flag_filter & BAM_FUNMAP)) { + stats->nfiltered++; + return 0; + } + + if (anum == -1) + stats->nfailprimer++; + + if (anum >= 0) { + int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left); + if (c > 0) { + stats->nreads[anum]++; + // NB: ref bases rather than read bases + stats->nbases[anum] += c; + + int64_t i; + if (start < 0) start = 0; + if (end > len) end = len; + + int64_t ostart = MAX(start, amp[anum].min_left-1); + int64_t oend = MIN(end, amp[anum].max_right); + int64_t offset = amp[anum].min_left-1; + for (i = ostart; i < oend; i++) + stats->coverage[anum*stats->max_amp_len + i-offset]++; + } else { + stats->nfailprimer++; + } + } + + // Template length in terms of amplicon number to amplicon number. + // We expect left to right of same amplicon (len 0), but it may go + // to next amplicon (len 1) or prev (len -1), etc. + int64_t t_end; + int oth_anum = -1; + + if (b->core.flag & BAM_FPAIRED) { + t_end = (b->core.flag & BAM_FREVERSE ? end : start) + + b->core.isize; + + // If we've clipped the primers but not followed up with a fixmates + // then our start+TLEN will take us to a location which is + // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away. + // + // The correct solution is to run samtools fixmate so TLEN is correct. + // The hacky solution is to fudge the expected tlen by double the + // average primer length (e.g. 50). + t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj; + + if (t_end > 0 && t_end < len && b->core.isize != 0) + oth_anum = (b->core.flag & BAM_FREVERSE) + ? pos2start[t_end] + : pos2end[t_end]; + } else { + // Not paired (see int anum = (REV || !PAIR) ?en :st expr above) + oth_anum = pos2start[start]; + t_end = end; + } + + // We don't want to count our pairs twice. + // If both left/right are known, count it on left only. + // If only one is known, we'll only get to this code once + // so we can also count it. 
+ int astatus = 2; + if (anum != -1 && oth_anum != -1) { + astatus = oth_anum == anum ? 0 : 1; + if (start <= t_end) + stats->amp_dist[anum][astatus]++; + } else if (anum >= 0) { + stats->amp_dist[anum][astatus = 2]++; + } + + if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) { + if (prev_end && mstart > prev_end) { + // 2nd read with gap to 1st; undo previous increment. + for (i = prev_start; i < prev_end; i++) + stats->depth_valid[i]--; + stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1; + } else { + // 1st read, or 2nd read that overlaps 1st + for (i = mstart; i < end; i++) + stats->depth_valid[i]++; + stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1; + } + } + + // Track template start,end frequencies, so we can give stats on + // amplicon primer usage. + if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0) + // left to right only, so we don't double count template positions. + return 0; + + start = b->core.pos; + t_end = b->core.flag & BAM_FPAIRED + ? 
start + b->core.isize-1 + : end; + uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32); + k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret); + if (ret < 0) + return -1; + if (ret == 0) + kh_value(stats->tcoord[anum+1], k)++; + else + kh_value(stats->tcoord[anum+1], k)=1; + kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32); + + return 0; +} + +// Append file local stats to global stats +int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) { + gstats->nseq += lstats->nseq; + gstats->nfiltered += lstats->nfiltered; + gstats->nfailprimer += lstats->nfailprimer; + + int a; + for (a = -1; a < namp; a++) { + // Add khash local (kl) to khash global (kg) + khiter_t kl, kg; + for (kl = kh_begin(lstats->tcoord[a+1]); + kl != kh_end(lstats->tcoord[a+1]); kl++) { + if (!kh_exist(lstats->tcoord[a+1], kl) || + kh_value(lstats->tcoord[a+1], kl) == 0) + continue; + + int ret; + kg = kh_put(tcoord, gstats->tcoord[a+1], + kh_key(lstats->tcoord[a+1], kl), + &ret); + if (ret < 0) + return -1; + + kh_value(gstats->tcoord[a+1], kg) = + (ret == 0 + ? (kh_value(gstats->tcoord[a+1], kg) & 0xFFFFFFFF) + : 0) + + kh_value(lstats->tcoord[a+1], kl); + } + if (a == -1) continue; + + gstats->nreads[a] += lstats->nreads[a]; + gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a]; + gstats->nfull_reads[a] += lstats->nfull_reads[a]; + + // To get mean & sd for amplicon read percentage, we need + // to do the divisions here as nseq differs for each sample. + double nrperc = all_nseq ? 
100.0 * lstats->nreads[a] / all_nseq : 0; + gstats->nrperc[a] += nrperc; + gstats->nrperc2[a] += nrperc*nrperc; + + gstats->nbases[a] += lstats->nbases[a]; + gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a]; + + int d; + for (d = 0; d < MAX_DEPTH; d++) { + gstats->covered_perc[a][d] += lstats->covered_perc[a][d]; + gstats->covered_perc2[a][d] += lstats->covered_perc[a][d] + * lstats->covered_perc[a][d]; + } + + for (d = 0; d < 3; d++) + gstats->amp_dist[a][d] += lstats->amp_dist[a][d]; + } + + for (a = 0; a < lstats->max_len; a++) { + gstats->depth_valid[a] += lstats->depth_valid[a]; + gstats->depth_all[a] += lstats->depth_all[a]; + } + + return 0; +} + +int append_stats(amplicons_t *amps, int nref) { + int i, r, all_nseq = 0; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = amps[r].lstats; + all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer; + } + + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp, + all_nseq) < 0) + return -1; + } + + return 0; +} + +typedef struct { + int32_t start, end; + uint32_t freq; + uint32_t status; +} tcoord_t; + +// Sort tcoord by descending frequency and then ascending start and end. +static int tcoord_freq_sort(const void *vp1, const void *vp2) { + const tcoord_t *t1 = (const tcoord_t *)vp1; + const tcoord_t *t2 = (const tcoord_t *)vp2; + + if (t1->freq != t2->freq) + return t2->freq - t1->freq; + + if (t1->start != t2->start) + return t1->start - t2->start; + + return t1->end - t2->end; +} + + +/* + * Merges tcoord start,end,freq,status tuples if their coordinates are + * close together. We aim to keep the start,end for the most frequent + * value and assume that is the correct coordinate and all others are + * minor fluctuations due to errors or variants. + * + * We sort by frequency first and then merge later items in the list into + * the earlier more frequent ones. 
It's O(N^2), but sufficient for now + * given current scale of projects. + * + * If we ever need to resolve that then consider sorting by start + * coordinate and scanning the list to find all items within X, find + * the most frequent of those, and then cluster that way. (I'd have + * done that had I thought of it at the time!) + */ +static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){ + size_t n = *np, j, j2, j3, k; + + // Sort by frequency and cluster infrequent coords into frequent + // ones provided they're close by. + // This is O(N^2), but we've already binned by tcoord_bin/2 so + // the list isn't intended to be vast at this point. + qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort); + + // For frequency ties, find mid start coord, and then find mid end + // coord of those matching start. + // We make that the first item so we merge into that mid point. + for (j = 0; j < n; j++) { + for (j2 = j+1; j2 < n; j2++) { + if (tpos[j].freq != tpos[j2].freq) + break; + if (tpos[j2].start - tpos[j].start >= args->tcoord_bin) + break; + } + + // j to j2 all within bin of a common start, + // m is the mid start. + if (j2-1 > j) { + size_t m = (j2-1 + j)/2; + + // Find mid end for this same start + while (m > 1 && tpos[m].start == tpos[m-1].start) + m--; + for (j3 = m+1; j3 < j2; j3++) { + if (tpos[m].start != tpos[j3].start) + break; + if (tpos[m].end - tpos[j3].end >= args->tcoord_bin) + break; + } + if (j3-1 > m) + m = (j3-1 + m)/2; + + // Swap with first item. + tcoord_t tmp = tpos[j]; + tpos[j] = tpos[m]; + tpos[m] = tmp; + j = j2-1; + } + } + + // Now merge in coordinates. + // This bit is O(N^2), so consider binning first to reduce the + // size of the list if we have excessive positional variation. 
+ for (k = j = 0; j < n; j++) { + if (!tpos[j].freq) + continue; + + if (k < j) + tpos[k] = tpos[j]; + + for (j2 = j+1; j2 < n; j2++) { + if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 && + ABS(tpos[j].end -tpos[j2].end) < args->tcoord_bin/2 && + tpos[j].status == tpos[j2].status) { + tpos[k].freq += tpos[j2].freq; + tpos[j2].freq = 0; + } + } + k++; + } + + *np = k; +} + +int dump_stats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref, int local) { + int i, r; + FILE *ofp = args->out_fp; + tcoord_t *tpos = NULL; + size_t ntcoord = 0; + + // summary stats for this sample (or for all samples) + fprintf(ofp, "# Summary stats.\n"); + fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type); + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer; + char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2); + if (!name_ref) + return -1; + if (args->multi_ref) + sprintf(name_ref, "%s\t%s", name, amps[r].ref); + else + sprintf(name_ref, "%s", name); + fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n", + type, name_ref, stats->nseq); + fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n", + type, name_ref, stats->nfiltered); + fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n", + type, name_ref, stats->nfailprimer); + fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n", + type, name_ref, nmatch); + + int d = 0; + do { + // From first to last amplicon only, so not entire consensus. + // If contig length is known, maybe we want to add the missing + // count to < DEPTH figures? 
+ int64_t start = 0, covered = 0, total = 0; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int64_t j, offset = amp[i].min_left-1; + if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { + fprintf(samtools_stderr, "[ampliconstats] error: " + "Maximum amplicon length (%d) exceeded for '%s'\n", + stats->max_amp, name); + return -1; + } + for (j = MAX(start, amp[i].max_left-1); + j < MAX(start, amp[i].min_right); j++) { + if (stats->coverage[i*stats->max_amp_len + j-offset] + >= args->min_depth[d]) + covered++; + total++; + } + start = MAX(start, amp[i].min_right); + } + fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%" + PRId64"\t%"PRId64"\n", type, name_ref, + args->min_depth[d], args->min_depth[d], + total-covered, covered); + } while (++d < MAX_DEPTH && args->min_depth[d]); + + free(name_ref); + } + + // Read count + fprintf(ofp, "# Absolute matching read counts per amplicon.\n"); + fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cREADS\t%s", type, name); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%"PRId64, stats->nreads[i]); + } + } + fprintf(ofp, "\n"); + + // Valid depth is the number of full length reads (already divided + // by the number we expect to cover), so +0.5 per read in pair. + // A.k.a "usable depth" in the plots. + fprintf(ofp, "%cVDEPTH\t%s", type, name); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) + fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]); + } + fprintf(ofp, "\n"); + + if (type == 'C') { + // For combined we can compute mean & standard deviation too + fprintf(ofp, "CREADS\tMEAN"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CREADS\tSTDDEV"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + double n1 = stats->nreads[i]; + fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0 + ? sqrt(stats->nreads2[i]/(double)nfile + - (n1/nfile)*(n1/nfile)) + : 0); + } + } + fprintf(ofp, "\n"); + } + + fprintf(ofp, "# Read percentage of distribution between amplicons.\n"); + fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cRPERC\t%s", type, name); + int all_nseq = 0; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer; + } + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + if (type == 'C') { + fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile); + } else { + fprintf(ofp, "\t%.3f", + all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0); + } + } + } + fprintf(ofp, "\n"); + + if (type == 'C') { + // For combined we compute mean and standard deviation too + fprintf(ofp, "CRPERC\tMEAN"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CRPERC\tSTDDEV"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + // variance = SUM(X^2) - ((SUM(X)^2) / N) + double n1 = stats->nrperc[i]; + double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile); + fprintf(ofp, "\t%.3f", v>0?sqrt(v):0); + } + } + fprintf(ofp, "\n"); + } + + // Base depth + fprintf(ofp, "# Read depth per amplicon.\n"); + fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cDEPTH\t%s", type, name); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; + int64_t alen = amp[i].min_right - amp[i].max_left+1; + fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0); + } + } + fprintf(ofp, "\n"); + + if (type == 'C') { + // For combined we can compute mean & standard deviation too + fprintf(ofp, "CDEPTH\tMEAN"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; + for (i = 0; i < amps[r].namp; i++) { + int64_t alen = amp[i].min_right - amp[i].max_left+1; + fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CDEPTH\tSTDDEV"); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + double alen = amp[i].min_right - amp[i].max_left+1; + double n1 = stats->nbases[i] / alen; + double v = stats->nbases2[i] / (alen*alen) /nfile + - (n1/nfile)*(n1/nfile); + fprintf(ofp, "\t%.1f", v>0?sqrt(v):0); + } + } + fprintf(ofp, "\n"); + } + + // Percent Coverage + if (type == 'F') { + fprintf(ofp, "# Percentage coverage per amplicon\n"); + fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type); + int d = 0; + do { + fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name); + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int covered = 0; + if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { + fprintf(samtools_stderr, "[ampliconstats] error: " + "Maximum amplicon length (%d) exceeded for '%s'\n", + stats->max_amp, name); + return -1; + } + int64_t j, offset = amp[i].min_left-1; + for (j = amp[i].max_left-1; j < amp[i].min_right; j++) { + int apos = i*stats->max_amp_len + j-offset; + if (stats->coverage[apos] >= args->min_depth[d]) + covered++; + } + int64_t alen = amp[i].min_right - amp[i].max_left+1; + stats->covered_perc[i][d] = 100.0 * covered / alen; + fprintf(ofp, "\t%.2f", 100.0 * covered / alen); + } + } + fprintf(ofp, "\n"); + } while (++d < MAX_DEPTH && args->min_depth[d]); + + } else if (type == 'C') { + // For combined we can compute mean & standard deviation too + int d = 0; + do { + fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile); + } + } + fprintf(ofp, "\n"); + + fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + double n1 = stats->covered_perc[i][d] / nfile; + double v = stats->covered_perc2[i][d] / nfile - n1*n1; + fprintf(ofp, "\t%.1f", v>0?sqrt(v):0); + } + } + fprintf(ofp, "\n"); + } while (++d < MAX_DEPTH && args->min_depth[d]); + } + + // Plus base depth for all reads, irrespective of amplicon. + // This is post overlap removal, if reads in the read-pair overlap. + fprintf(ofp, "# Depth per reference base for ALL data.\n"); + fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n", + type); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + if (args->multi_ref) + fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref); + else + fprintf(ofp, "%cDP_ALL\t%s", type, name); + + for (i = 0; i < amps[r].len; i++) { + // Basic run-length encoding provided all values are within + // +- depth_bin fraction of the mid-point. 
+ int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j; + double dmid = (dmin + dmax)/2.0; + double low = dmid*(1-args->depth_bin); + double high = dmid*(1+args->depth_bin); + for (j = i+1; j < amps[r].len; j++) { + int d = stats->depth_all[j]; + if (d < low || d > high) + break; + if (dmin > d) { + dmin = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } else if (dmax < d) { + dmax = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } + } + fprintf(ofp, "\t%d,%d", (int)dmid, j-i); + i = j-1; + } + fprintf(ofp, "\n"); + } + + // And depth for only reads matching to a single amplicon for full + // length. This is post read overlap removal. + fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n"); + fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this " + "part.\n", type); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + if (args->multi_ref) + fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref); + else + fprintf(ofp, "%cDP_VALID\t%s", type, name); + + for (i = 0; i < amps[r].len; i++) { + int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j; + double dmid = (dmin + dmax)/2.0; + double low = dmid*(1-args->depth_bin); + double high = dmid*(1+args->depth_bin); + for (j = i+1; j < amps[r].len; j++) { + int d = stats->depth_valid[j]; + if (d < low || d > high) + break; + if (dmin > d) { + dmin = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } else if (dmax < d) { + dmax = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } + } + fprintf(ofp, "\t%d,%d", (int)dmid, j-i); + i = j-1; + } + fprintf(ofp, "\n"); + } + + // TCOORD (start to end) distribution + fprintf(ofp, "# Distribution of aligned template coordinates.\n"); + fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0 - (nref==1); i < amps[r].namp; i++) { + if (ntcoord < kh_size(stats->tcoord[i+1])) { + ntcoord = kh_size(stats->tcoord[i+1]); + tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp)); + if (!tmp) { + free(tpos); + return -1; + } + tpos = tmp; + } + + khiter_t k; + size_t n = 0, j; + for (k = kh_begin(stats->tcoord[i+1]); + k != kh_end(stats->tcoord[i+1]); k++) { + if (!kh_exist(stats->tcoord[i+1], k) || + (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0) + continue; + // Key is start,end in 32-bit quantities. + // Yes this limits us to 4Gb references, but just how + // many primers are we planning on making? Not that many + // I hope. 
+ tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff; + tpos[n].end = kh_key(stats->tcoord[i+1], k)>>32; + + // Value is frequency (top 32-bits) and status (bottom 32). + tpos[n].freq = kh_value(stats->tcoord[i+1], k)&0xffffffff; + tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32; + n++; + } + + if (args->tcoord_bin > 1) + aggregate_tcoord(args, tpos, &n); + + fprintf(ofp, "%cTCOORD\t%s\t%d", type, name, + i+1+amps[r].first_amp); // per amplicon + for (j = 0; j < n; j++) { + if (tpos[j].freq < args->tcoord_min_count) + continue; + fprintf(ofp, "\t%d,%d,%u,%u", + tpos[j].start, + tpos[j].end, + tpos[j].freq, + tpos[j].status); + } + fprintf(ofp, "\n"); + } + } + + + // AMP length distribution. + // 0 = both ends in this amplicon + // 1 = ends in different amplicons + // 2 = other end matching an unknown amplicon site + // (see tcoord for further analysis of where) + fprintf(ofp, "# Classification of amplicon status. Columns are\n"); + fprintf(ofp, "# number with both primers from this amplicon, number with\n"); + fprintf(ofp, "# primers from different amplicon, and number with a position\n"); + fprintf(ofp, "# not matching any valid amplicon primer site\n"); + fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type); + + fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged + int amp_dist[3] = {0}; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps + amp_dist[0] += stats->amp_dist[i][0]; + amp_dist[1] += stats->amp_dist[i][1]; + amp_dist[2] += stats->amp_dist[i][2]; + } + } + fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]); + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + // per amplicon + fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp); + fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0], + stats->amp_dist[i][1], stats->amp_dist[i][2]); + } + } + + free(tpos); + return 0; +} + +int dump_lstats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref) { + return dump_stats(args, type, name, nfile, amps, nref, 1); +} + +int dump_gstats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref) { + return dump_stats(args, type, name, nfile, amps, nref, 0); +} + +char const *get_sample_name(sam_hdr_t *header, char *RG) { + kstring_t ks = {0}; + sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks); + return ks.s; +} + +// Return maximum reference length (SQ is NULL) or the length +// of the specified reference in SQ. +int64_t get_ref_len(sam_hdr_t *header, const char *SQ) { + if (SQ) { + int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0; + return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1; + } else { + int nref = sam_hdr_nref(header), tid;; + int64_t len = 0; + for (tid = 0; tid < nref; tid++) { + int64_t rl = sam_hdr_tid2len(header, tid); + if (len < rl) + len = rl; + } + return len; + } +} + +static int amplicon_stats(astats_args_t *args, + khash_t(bed_list_hash) *bed_hash, + char **filev, int filec) { + int i, ref = -1, ref_tid = -1, ret = -1, nref = 0; + samFile *fp = NULL; + sam_hdr_t *header = NULL; + bam1_t *b = bam_init1(); + FILE *ofp = args->out_fp; + char sname_[8192], *sname = NULL; + amplicons_t *amps = NULL; + + // Report initial SS header. We gather data from the bed_hash entries + // as well as from the first SAM header (with the requirement that all + // headers should be compatible). 
+ if (filec) { + if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) { + print_error_errno("ampliconstats", + "Cannot open input file \"%s\"", + filev[0]); + goto err; + } + if (!(header = sam_hdr_read(fp))) + goto err; + + if (!amps) { + amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps)); + if (!amps) + goto err; + fprintf(ofp, "# Summary statistics, used for scaling the plots.\n"); + fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version()); + fprintf(ofp, "SS\tCommand line: %s\n", args->argv); + fprintf(ofp, "SS\tNumber of files:\t%d\n", filec); + + // Note: order of hash entries will be different to order of + // BED file which may also differ to order of SQ headers. + // SQ header is canonical ordering (pos sorted file). + khiter_t k; + int bam_nref = sam_hdr_nref(header); + for (i = 0; i < bam_nref; i++) { + k = kh_get(bed_list_hash, bed_hash, + sam_hdr_tid2name(header, i)); + if (!kh_exist(bed_hash, k)) + continue; + + bed_entry_list_t *sites = &kh_value(bed_hash, k); + + ref = i; + amps[ref].ref = kh_key(bed_hash, k); + amps[ref].sites = sites; + amps[ref].namp = count_amplicon(sites); + amps[ref].amp = calloc(sites->length, + sizeof(*amps[ref].amp)); + if (!amps[ref].amp) + goto err; + if (args->multi_ref) + fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n", + kh_key(bed_hash, k), amps[ref].namp); + else + fprintf(ofp, "SS\tNumber of amplicons:\t%d\n", + amps[ref].namp); + + amps[ref].tid = ref; + if (ref_tid == -1) + ref_tid = ref; + + int64_t len = get_ref_len(header, kh_key(bed_hash, k)); + amps[ref].len = len; + if (args->multi_ref) + fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n", + kh_key(bed_hash, k), len); + else + fprintf(ofp, "SS\tReference length:\t%"PRId64"\n", + len); + + amps[ref].lstats = stats_alloc(len, args->max_amp, + args->max_amp_len); + amps[ref].gstats = stats_alloc(len, args->max_amp, + args->max_amp_len); + if (!amps[ref].lstats || !amps[ref].gstats) + goto err; + } + } + + sam_hdr_destroy(header); + header = 
NULL; + if (sam_close(fp) < 0) { + fp = NULL; + goto err; + } + fp = NULL; + } + fprintf(ofp, "SS\tEnd of summary\n"); + + // Extract the bits of amplicon data we need from bed hash and turn + // it into a position-to-amplicon lookup table. + int offset = 0; + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + + amps[i].first_amp = offset; + if (bed2amplicon(args, amps[i].sites, amps[i].amp, + &s[i].namp, i==0, amps[i].ref, offset) < 0) + goto err; + + offset += amps[i].namp; // cumulative amplicon number across refs + } + + // Now iterate over file contents, one at a time. + for (i = 0; i < filec; i++) { + char *nstart = filev[i]; + + fp = sam_open_format(filev[i], "r", &args->ga.in); + if (!fp) { + print_error_errno("ampliconstats", + "Cannot open input file \"%s\"", + filev[i]); + goto err; + } + + if (args->ga.nthreads > 0) + hts_set_threads(fp, args->ga.nthreads); + + if (!(header = sam_hdr_read(fp))) + goto err; + + if (nref != sam_hdr_nref(header)) { + print_error_errno("ampliconstats", + "SAM headers are not consistent across input files"); + goto err; + } + int r; + for (r = 0; r < nref; r++) { + if (!amps[r].ref || + strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 || + amps[r].len != sam_hdr_tid2len(header, r)) { + print_error_errno("ampliconstats", + "SAM headers are not consistent across " + "input files"); + goto err; + } + } + + if (args->use_sample_name) + sname = (char *)get_sample_name(header, NULL); + + if (!sname) { + sname = sname_; + char *nend = filev[i] + strlen(filev[i]), *cp; + if ((cp = strrchr(filev[i], '/'))) + nstart = cp+1; + if ((cp = strrchr(nstart, '.')) && + (strcmp(cp, ".bam") == 0 || + strcmp(cp, ".sam") == 0 || + strcmp(cp, ".cram") == 0)) + nend = cp; + if (nend - nstart >= 8192) nend = nstart+8191; + memcpy(sname, nstart, nend-nstart); + sname[nend-nstart] = 0; + } + + // Stats local to this sample only + amp_stats_reset(amps, nref); + + int last_ref = -9; + while ((r = sam_read1(fp, header, b)) >= 0) { + 
// Other filter options useful here? + if (b->core.tid < 0) + continue; + + if (last_ref != b->core.tid) { + last_ref = b->core.tid; + if (initialise_amp_pos_lookup(args, amps, last_ref) < 0) + goto err; + } + + if (accumulate_stats(args, amps, b) < 0) + goto err; + } + + if (r < -1) { + print_error_errno("ampliconstats", "Fail reading record"); + goto err; + } + + sam_hdr_destroy(header); + if (sam_close(fp) < 0) { + fp = NULL; + goto err; + } + + fp = NULL; + header = NULL; + + if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0) + goto err; + + if (append_stats(amps, nref) < 0) + goto err; + + if (sname && sname != sname_) + free(sname); + sname = NULL; + } + + if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0) + goto err; + + ret = 0; + err: + bam_destroy1(b); + if (ret) { + if (header) + sam_hdr_destroy(header); + if (fp) + sam_close(fp); + } + for (i = 0; i < nref; i++) { + stats_free(amps[i].lstats); + stats_free(amps[i].gstats); + free(amps[i].amp); + } + free(amps); + free(pos2start); + free(pos2end); + if (ret) { + if (sname && sname != sname_) + free(sname); + } + + return ret; +} + +static int usage(astats_args_t *args, FILE *fp, int exit_status) { + fprintf(fp, +"\n" +"Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n" +"\n" +"Options:\n"); + fprintf(fp, " -f, --required-flag STR|INT\n" + " Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require); + fprintf(fp, " -F, --filter-flag STR|INT\n" + " Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff); + fprintf(fp, " -a, --max-amplicons INT\n" + " Change the maximum number of amplicons permitted [%d]\n", MAX_AMP); + fprintf(fp, " -l, --max-amplicon-length INT\n" + " Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN); + fprintf(fp, " -d, --min-depth INT[,INT]...\n" + " Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]); + fprintf(fp, " -m, --pos-margin INT\n" 
+                " Margin of error for matching primer positions [%d]\n", args->max_delta);
+    fprintf(fp, " -o, --output FILE\n"
+                " Specify output file [samtools_stdout if unset]\n");
+    fprintf(fp, " -s, --use-sample-name\n"
+                " Use the sample name from the first @RG header line\n");
+    fprintf(fp, " -t, --tlen-adjust INT\n"
+                " Add/subtract from TLEN; use when clipping but no fixmate step\n");
+    fprintf(fp, " -b, --tcoord-bin INT\n"
+                " Bin template start,end positions into multiples of INT[1]\n");
+    fprintf(fp, " -c, --tcoord-min-count INT\n"
+                " Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT);
+    fprintf(fp, " -D, --depth-bin FRACTION\n"
+                " Merge FDP values within +/- FRACTION together\n");
+    fprintf(fp, " -S, --single-ref\n"
+                " Force single-ref (<=1.12) output format\n");
+    sam_global_opt_help(fp, "I.--.@");
+
+    return exit_status;
+}
+
+/*
+ * Entry point for "samtools ampliconstats".
+ *
+ * Parses the command line into an astats_args_t, loads the primer BED file
+ * named by the first positional argument into a per-reference hash, and
+ * hands the remaining positional arguments (or "-" when there are none) to
+ * amplicon_stats().
+ *
+ * Returns the value of amplicon_stats() on a normal run, the exit status
+ * passed to usage() when help is shown or an option is rejected, and 1 on
+ * any set-up failure.
+ *
+ * NOTE(review): a file opened for -o is stored in args.out_fp and is not
+ * closed anywhere in this function; the start of amplicon_stats() is not
+ * visible from here, so confirm who owns it before adding an fclose().
+ */
+int main_ampliconstats(int argc, char **argv) {
+    astats_args_t args = {
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .flag_require = 0,
+        .flag_filter = 0x10B04,   // bit 0x10000 marks "still the default"; see -F below
+        //.sites = BED_LIST_INIT,
+        .max_delta = 30, // large enough to cope with alt primers
+        .min_depth = {1},
+        .use_sample_name = 0,
+        .max_amp = MAX_AMP,
+        .max_amp_len = MAX_AMP_LEN,
+        .tlen_adj = 0,
+        .out_fp = samtools_stdout,
+        .tcoord_min_count = TCOORD_MIN_COUNT,
+        .tcoord_bin = 1,
+        .depth_bin = 0.01,
+        .multi_ref = 1
+    }, oargs = args;   // oargs keeps the untouched defaults for usage() to print
+
+    static const struct option loptions[] =
+    {
+        SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
+        {"help", no_argument, NULL, 'h'},
+        {"flag-require", required_argument, NULL, 'f'},
+        {"required-flag", required_argument, NULL, 'f'}, // spelling printed by usage()
+        {"flag-filter", required_argument, NULL, 'F'},
+        {"filter-flag", required_argument, NULL, 'F'},   // spelling printed by usage()
+        {"min-depth", required_argument, NULL, 'd'},
+        {"output", required_argument, NULL, 'o'},
+        {"pos-margin", required_argument, NULL, 'm'},
+        {"use-sample-name", no_argument, NULL, 's'},
+        {"max-amplicons", required_argument, NULL, 'a'},
+        {"max-amplicon-length", required_argument, NULL, 'l'},
+        {"tlen-adjust", required_argument, NULL, 't'},
+        {"tcoord-min-count", required_argument, NULL, 'c'},
+        {"tcoord-bin", required_argument, NULL, 'b'},
+        {"depth-bin", required_argument, NULL, 'D'},
+        {"single-ref", no_argument, NULL, 'S'},
+        {NULL, 0, NULL, 0}
+    };
+    int opt;
+
+    while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
+        switch (opt) {
+        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'F':
+            if (args.flag_filter & 0x10000)
+                args.flag_filter = 0; // strip default on first -F usage
+            args.flag_filter |= bam_str2flag(optarg); break;
+
+        case 'm': args.max_delta = atoi(optarg); break; // margin
+        case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
+        case 'd': {
+            // Comma separated list; at most MAX_DEPTH values are stored.
+            int d = 0;
+            char *cp = optarg, *ep;
+            do {
+                long n = strtol(cp, &ep, 10);
+                args.min_depth[d++] = n;
+                if (*ep != ',')
+                    break;
+                cp = ep+1;
+            } while (d < MAX_DEPTH);
+            break;
+        }
+
+        case 'a': args.max_amp = atoi(optarg)+1;break;
+        case 'l': args.max_amp_len = atoi(optarg)+1;break;
+
+        case 'c': args.tcoord_min_count = atoi(optarg);break;
+        case 'b':
+            args.tcoord_bin = atoi(optarg);
+            if (args.tcoord_bin < 1)
+                args.tcoord_bin = 1;
+            break;
+
+        case 't': args.tlen_adj = atoi(optarg);break;
+
+        case 's': args.use_sample_name = 1;break;
+
+        case 'o':
+            if (!(args.out_fp = fopen(optarg, "w"))) {
+                perror(optarg);
+                return 1;
+            }
+            break;
+
+        case 'S':
+            args.multi_ref = 0;
+            break;
+
+        case '?': return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+        case 'h': return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
+
+        default:
+            // Anything else belongs to the shared samtools option set.
+            // A rejected option must stop the command: previously usage()
+            // was printed but its status dropped and parsing carried on.
+            if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
+                return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+            break;
+        }
+    }
+
+    if (argc <= optind)
+        return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
+    if (argc <= optind+1 && isatty(STDIN_FILENO))
+        return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    if (!bed_hash) {
+        print_error_errno("ampliconstats", "Could not allocate BED hash");
+        return 1;
+    }
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+        print_error_errno("ampliconstats",
+                          "Could not read file \"%s\"", argv[optind]);
+        // NOTE(review): bed_hash is not released here because it is not
+        // visible whether load_bed_file_multi_ref() cleans up on failure.
+        return 1;
+    }
+
+    // Count the references that actually have amplicons in the BED file.
+    khiter_t k, ref_count = 0;
+    for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
+        if (!kh_exist(bed_hash, k))
+            continue;
+        ref_count++;
+    }
+    if (ref_count == 0) {
+        print_error("ampliconstats",
+                    "No amplicon regions found in \"%s\"", argv[optind]);
+        destroy_bed_hash(bed_hash);
+        return 1;
+    }
+    if (ref_count > 1 && args.multi_ref == 0) {
+        print_error("ampliconstats",
+                    "Single-ref mode is not permitted for BED files\n"
+                    "containing more than one reference.");
+        destroy_bed_hash(bed_hash);
+        return 1;
+    }
+
+    // The command line is echoed with "%s" in the summary header, so a
+    // failed allocation must not be passed on as NULL.
+    args.argv = stringify_argv(argc, argv);
+    if (!args.argv) {
+        print_error_errno("ampliconstats", "Could not record command line");
+        destroy_bed_hash(bed_hash);
+        return 1;
+    }
+
+    int ret;
+    if (argc == ++optind) {
+        // Only the BED file was given: read alignments from stdin.
+        char *av = "-";
+        ret = amplicon_stats(&args, bed_hash, &av, 1);
+    } else {
+        ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
+    }
+
+    free(args.argv);
+    destroy_bed_hash(bed_hash);
+
+    return ret;
+}
diff --git a/samtools/bam.c b/samtools/bam.c index 0c1a06b..926062c 100644 --- a/samtools/bam.c +++ b/samtools/bam.c @@ -1,6 +1,6 @@ /* bam.c -- BAM format. - Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd. + Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. 
Author: Heng Li @@ -125,21 +125,21 @@ int bam_remove_B(bam1_t *b) uint8_t *seq, *qual, *p; // test if removal is necessary if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing - cigar = bam1_cigar(b); + cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if (bam_cigar_op(cigar[k]) == BAM_CBACK) break; if (k == b->core.n_cigar) return 0; // no 'B' if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed // allocate memory for the new CIGAR - if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory - b->m_data = b->data_len + b->core.n_cigar * 4; + if (b->l_data + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory + b->m_data = b->l_data + b->core.n_cigar * 4; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); - cigar = bam1_cigar(b); // after realloc, cigar may be changed + cigar = bam_get_cigar(b); // after realloc, cigar may be changed } new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data // the core loop - seq = bam1_seq(b); qual = bam1_qual(b); + seq = bam_get_seq(b); qual = bam_get_qual(b); no_qual = (qual[0] == 0xff); // test whether base quality is available i = j = 0; end_j = -1; for (k = l = 0; k < b->core.n_cigar; ++k) { @@ -168,9 +168,9 @@ int bam_remove_B(bam1_t *b) if (i != j) { // no need to copy if i == j int u, c, c0; for (u = 0; u < len; ++u) { // construct the consensus - c = bam1_seqi(seq, i+u); + c = bam_seqi(seq, i+u); if (j + u < end_j) { // in an overlap - c0 = bam1_seqi(seq, j+u); + c0 = bam_seqi(seq, j+u); if (c != c0) { // a mismatch; choose the better base if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better bam1_seq_seti(seq, j+u, c); @@ -202,9 +202,9 @@ int bam_remove_B(bam1_t *b) p = b->data + b->core.l_qname + l * 4; memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ memmove(p, qual, j); p += j; // set QUAL - memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // 
set optional fields + memmove(p, bam_get_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length - b->data_len = p - b->data; // update record length + b->l_data = p - b->data; // update record length return 0; rmB_err: diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c index 4c41e23..2f40ca6 100644 --- a/samtools/bam.c.pysam.c +++ b/samtools/bam.c.pysam.c @@ -2,7 +2,7 @@ /* bam.c -- BAM format. - Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd. + Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -127,21 +127,21 @@ int bam_remove_B(bam1_t *b) uint8_t *seq, *qual, *p; // test if removal is necessary if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing - cigar = bam1_cigar(b); + cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if (bam_cigar_op(cigar[k]) == BAM_CBACK) break; if (k == b->core.n_cigar) return 0; // no 'B' if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed // allocate memory for the new CIGAR - if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory - b->m_data = b->data_len + b->core.n_cigar * 4; + if (b->l_data + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory + b->m_data = b->l_data + b->core.n_cigar * 4; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); - cigar = bam1_cigar(b); // after realloc, cigar may be changed + cigar = bam_get_cigar(b); // after realloc, cigar may be changed } new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data // the core loop - seq = bam1_seq(b); qual = bam1_qual(b); + seq = bam_get_seq(b); qual = bam_get_qual(b); no_qual = (qual[0] == 0xff); // test whether base quality is available i = j = 0; end_j = -1; for (k = l = 0; k < b->core.n_cigar; ++k) { @@ -170,9 +170,9 @@ int 
bam_remove_B(bam1_t *b) if (i != j) { // no need to copy if i == j int u, c, c0; for (u = 0; u < len; ++u) { // construct the consensus - c = bam1_seqi(seq, i+u); + c = bam_seqi(seq, i+u); if (j + u < end_j) { // in an overlap - c0 = bam1_seqi(seq, j+u); + c0 = bam_seqi(seq, j+u); if (c != c0) { // a mismatch; choose the better base if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better bam1_seq_seti(seq, j+u, c); @@ -204,9 +204,9 @@ int bam_remove_B(bam1_t *b) p = b->data + b->core.l_qname + l * 4; memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ memmove(p, qual, j); p += j; // set QUAL - memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields + memmove(p, bam_get_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length - b->data_len = p - b->data; // update record length + b->l_data = p - b->data; // update record length return 0; rmB_err: diff --git a/samtools/bam.h b/samtools/bam.h index 8c9d33a..804d590 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ @copyright Genome Research Ltd. */ -#define BAM_VERSION "1.10" +#define BAM_VERSION "1.13" #include #include @@ -77,7 +77,7 @@ typedef bam_hdr_t bam_header_t; #define BAM_OFHEX 1 #define BAM_OFSTR 2 -/*! @abstract defautl mask for pileup */ +/*! @abstract default mask for pileup */ #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) /*! 
@typedef diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c index 104d108..17dedf0 100644 --- a/samtools/bam2bcf_indel.c +++ b/samtools/bam2bcf_indel.c @@ -408,6 +408,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf { // do realignment; this is the bottleneck const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; + if (qend < qbeg) { + fprintf(stderr, "Impossible data in bcf_call_gap_prep\n"); + exit(1); + } qq = calloc(qend - qbeg, 1); bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); if (bq) ++bq; // skip type diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c index 583f99d..6706298 100644 --- a/samtools/bam2bcf_indel.c.pysam.c +++ b/samtools/bam2bcf_indel.c.pysam.c @@ -410,6 +410,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf { // do realignment; this is the bottleneck const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; + if (qend < qbeg) { + fprintf(samtools_stderr, "Impossible data in bcf_call_gap_prep\n"); + samtools_exit(1); + } qq = calloc(qend - qbeg, 1); bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); if (bq) ++bq; // skip type diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index 4b537c7..5253dfa 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -1,9 +1,11 @@ /* bam2depth.c -- depth subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd. + Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd. + + Author: Heng Li (to 2020) + Author: James Bonfield (2021 rewrite) - Author: Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,7 +26,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ /* This program demonstrates how to generate pileup from multiple BAMs - * simutaneously, to achieve random access and to use the BED interface. + * simultaneously, to achieve random access and to use the BED interface. * To compile this program separately, you may: * * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz @@ -41,355 +43,913 @@ DEALINGS IN THE SOFTWARE. */ #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" +#include "htslib/khash.h" -#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1) +// From bam_plcmd.c +int read_file_list(const char *file_list, int *n, char **argv[]); -typedef struct { // auxiliary data structure - samFile *fp; // the file handle - sam_hdr_t *hdr; // the file header - hts_itr_t *iter; // NULL if a region not specified - int min_mapQ, min_len; // mapQ filter; length filter - uint32_t flags; // read filtering flags -} aux_t; +// We accumulate to hist[pos & (size-1)]. This is a ring-buffer. +// We track where we last got to in output and what the biggest value +// we've written to so far (in absolute unmasked coordinates) in +// "last_output" and "end_pos" respectively. +// For each new record we just flush anything we haven't written yet +// already, between "last_output" and this read's start position, and +// initialise any newly seen positions between "end_pos" and this read's +// end position. +typedef struct { + size_t size; + int **hist; // hist[nfiles][size] + hts_pos_t *end_pos; // end_pos[nfiles] + hts_pos_t last_output; + int last_ref; + int nfiles; + const char *ref; + kstring_t ks; + hts_pos_t beg, end; // limit to region + int tid; +} depth_hist; -// This function reads a BAM alignment from one BAM file. -static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup -{ - aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - int ret; - while (1) - { - ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); - if ( ret<0 ) break; - if ( b->core.flag & aux->flags) continue; - if ( (int)b->core.qual < aux->min_mapQ ) continue; - if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - break; +typedef struct { + int header; + int flag; + int min_qual; + int min_mqual; + int min_len; + int skip_del; + int all_pos; + int remove_overlaps; + FILE *out; + char *reg; + void *bed; +} depth_opt; + +static void zero_region(depth_opt *opt, depth_hist *dh, + const char *name, hts_pos_t start, hts_pos_t end) { + hts_pos_t i; + kstring_t *ks = &dh->ks; + + kputs(name, ks_clear(ks)); + kputc('\t', ks); + size_t cur_l = ks->l; + if (dh->beg >= 0 && start < dh->beg) + start = dh->beg; + if (dh->end >= 0 && end > dh->end) + end = dh->end; + + for (i = start; i < end; i++) { + // Could be optimised, but needs better API to skip to next + // bed region. + if (opt->bed && bed_overlap(opt->bed, name, i, i+1) == 0) + continue; + + ks->l = cur_l; + kputll(i+1, ks); + int n; + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', ks); + kputc_('0', ks); + } + kputc('\n', ks); + fputs(ks->s, opt->out); } - return ret; + ks->l = cur_l; } -int read_file_list(const char *file_list,int *n,char **argv[]); - -static int usage() { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a output all positions (including zero depth)\n"); - fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n"); - fprintf(stderr, " -b list of positions or regions\n"); - fprintf(stderr, " -X use customized index files\n"); - fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(stderr, " -H print a file header\n"); - fprintf(stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); - fprintf(stderr, " -d/-m maximum coverage depth [8000]. If 0, depth is set to the maximum\n" - " integer value, effectively removing any depth limit.\n"); // the htslib's default - fprintf(stderr, " -o FILE where to write output to [stdout]\n"); - fprintf(stderr, " -q base quality threshold [0]\n"); - fprintf(stderr, " -Q mapping quality threshold [0]\n"); - fprintf(stderr, " -r region\n"); - fprintf(stderr, " -g include reads that have any of the specified flags set [0]\n"); - fprintf(stderr, " -G filter out reads that have any of the specified flags set" - " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); - - sam_global_opt_help(stderr, "-.--.--."); - - fprintf(stderr, "\n"); - fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); - fprintf(stderr, "position, and coverage depth. Note that positions with zero coverage may be\n"); - fprintf(stderr, "omitted by default; see the -a option.\n"); - fprintf(stderr, "\n"); - - return EXIT_FAILURE; +// A variation of bam_cigar2qlen which doesn't count soft-clips in to the +// equation. Basically it's the number of bases in query that are aligned +// in some way to the reference (including insertions, which are considered +// to be aligned by dint of being anchored either side). +hts_pos_t qlen_used(bam1_t *b) { + int n_cigar = b->core.n_cigar; + const uint32_t *cigar = bam_get_cigar(b); + + hts_pos_t l; + + if (b->core.l_qseq) { + // Known SEQ permits of short cut of l_qseq minus CSOFT_CLIPs. + // Full scan not needed, which helps on excessively long CIGARs. 
+ l = b->core.l_qseq; + int kl, kr; + for (kl = 0; kl < n_cigar; kl++) + if (bam_cigar_op(cigar[kl]) == BAM_CSOFT_CLIP) + l -= bam_cigar_oplen(cigar[kl]); + else + break; + + for (kr = n_cigar-1; kr > kl; kr--) + if (bam_cigar_op(cigar[kr]) == BAM_CSOFT_CLIP) + l -= bam_cigar_oplen(cigar[kr]); + else + break; + } else { + // Unknown SEQ ("*") needs a full scan through the CIGAR string. + static int query[16] = { + //M I D N S H P = X B ? ? ? ? ? ? + 1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0 + }; + int k; + for (k = l = 0; k < n_cigar; k++) + if (query[bam_cigar_op(cigar[k])]) + l += bam_cigar_oplen(cigar[k]); + } + return l; + } -int main_depth(int argc, char *argv[]) -{ - int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; - hts_pos_t beg, end, pos, last_pos = -1; - int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; - const bam_pileup1_t **plp; - char *reg = 0; // specified region - void *bed = 0; // BED data structure - char *file_list = NULL, **fn = NULL; - sam_hdr_t *h = NULL; // BAM header of the 1st input - aux_t **data; - bam_mplp_t mplp; - int last_tid = -1, ret; - int print_header = 0; - char *output_file = NULL; - FILE *file_out = stdout; - uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); - int tflags = 0; +// Adds the depth for a single read to a depth_hist struct. +// For just one file, this is easy. We just have a circular buffer +// where we increment values for bits that overlap existing data +// and initialise values for coordinates which we're seeing for the first +// time. This is tracked by "end_pos" to know where we've got to. +// +// As the input is sorted, we can flush output from "last_output" to +// b->core.pos. +// +// With multiple files, we must feed data in sorted order as if all files +// are merged, but track depth per file. This also means "end_pos" is per +// file too, but "last_output" is global as it corresponds to rows printed. 
+static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, + int overlap_clip, int file) { + hts_pos_t i; + size_t hmask = dh->size-1; + int n; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), - { NULL, 0, NULL, 0 } - }; + if (!b || b->core.tid != dh->last_ref) { + // New ref + if (dh->last_ref >= 0) { + // do end + size_t cur_l = dh->ks.l; + int nf = dh->nfiles; + i = dh->last_output; + for (i = dh->last_output; nf; i++) { + nf = 0; + for (n = 0; n < dh->nfiles; n++) { + if (i < dh->end_pos[n]) + nf++; + } + if (!nf) + break; + + if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0) + continue; - // parse the command line - while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) { - switch (n) { - case 'l': min_len = atoi(optarg); break; // minimum query length - case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header - case 'b': - bed = bed_read(optarg); // BED or position list file can be parsed now - if (!bed) { - print_error_errno("depth", "Could not read file \"%s\"", optarg); - return EXIT_FAILURE; + dh->ks.l = cur_l; + kputll(i+1, &dh->ks); + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', &dh->ks); + int d = i < dh->end_pos[n] + ? 
dh->hist[n][i & hmask] + : 0; + kputuw(d, &dh->ks); } - break; - case 'X': has_index_file = 1; break; - case 'q': baseQ = atoi(optarg); break; // base quality threshold - case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold - case 'f': file_list = optarg; break; - case 'a': all++; break; - case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth - case 'H': print_header = 1; break; - case 'o': output_file = optarg; break; - case 'g': - tflags = bam_str2flag(optarg); - if (tflags < 0 || tflags > BAM_FMAX) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); - return 1; + kputc('\n', &dh->ks); + fputs(dh->ks.s, opt->out); + } + if (opt->all_pos) { + // End of last ref + zero_region(opt, dh, + sam_hdr_tid2name(h, dh->last_ref), + i, sam_hdr_tid2len(h, dh->last_ref)); + } + dh->ks.l = cur_l; + } + + if (opt->all_pos > 1 && !opt->reg) { + // Any previous unused refs + int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1; + int rr = b ? b->core.tid : sam_hdr_nref(h), r; + for (r = lr; r < rr; r++) + zero_region(opt, dh, + sam_hdr_tid2name(h, r), + 0, sam_hdr_tid2len(h, r)); + } + + if (!b) { + // we're just flushing to end of file + if (opt->all_pos && opt->reg && dh->last_ref < 0) + // -a or -aa without a single read being output yet + zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg, + MIN(dh->end, sam_hdr_tid2len(h, dh->tid))); + + return 0; + } + + for (n = 0; dh->end_pos && n < dh->nfiles; n++) + dh->end_pos[n] = 0; + dh->last_output = dh->beg >= 0 + ? 
MAX(b->core.pos, dh->beg) + : b->core.pos; + dh->last_ref = b->core.tid; + dh->ref = sam_hdr_tid2name(h, b->core.tid); + kputs(dh->ref, ks_clear(&dh->ks)); + kputc('\t', &dh->ks); + + if (opt->all_pos) + // Start of ref + zero_region(opt, dh, dh->ref, 0, b->core.pos); + } else { + if (dh->last_output < b->core.pos) { + // Flush any depth outputs up to start of new read + size_t cur_l = dh->ks.l; + int nf = dh->nfiles; + for (i = dh->last_output; i < b->core.pos; i++) { + nf = 0; + for (n = 0; n < dh->nfiles; n++) { + if (i < dh->end_pos[n]) + nf++; } - flags &= ~tflags; - break; - case 'G': - tflags = bam_str2flag(optarg); - if (tflags < 0 || tflags > BAM_FMAX) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); - return 1; + if (!nf) + break; + + if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0) + continue; + + dh->ks.l = cur_l; + kputll(i+1, &dh->ks); + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', &dh->ks); + int d = i < dh->end_pos[n] + ? dh->hist[n][i & hmask] + : 0; + kputuw(d, &dh->ks); } - flags |= tflags; - break; - default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(); + kputc('\n', &dh->ks); + fputs(dh->ks.s, opt->out); + } + if (opt->all_pos && i < b->core.pos) + // Hole in middle of ref + zero_region(opt, dh, dh->ref, i, b->core.pos); + + dh->ks.l = cur_l; + dh->last_output = b->core.pos; } } - if (optind == argc && !file_list) - return usage(); - - /* output file provided by user */ - if (output_file != NULL && strcmp(output_file,"-")!=0) { - file_out = fopen( output_file, "w" ); - if (file_out == NULL) { - print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); - return EXIT_FAILURE; - } + + hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end. 
+ //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos); + + if (b->core.tid < dh->last_ref || + (dh->last_ref == b->core.tid && end_pos < dh->last_output)) { + print_error_errno("depth", "Data is not position sorted"); + return -1; } + // If needed, grow the circular buffer. + if (end_pos+1 - b->core.pos >= dh->size) { + size_t old_size = dh->size; + size_t old_hmask = hmask; + while (end_pos+1 - b->core.pos >= dh->size) + dh->size = dh->size ? 2*dh->size : 2048; + hmask = dh->size-1; + if (!dh->hist) { + dh->hist = calloc(dh->nfiles, sizeof(*dh->hist)); + dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos)); + if (!dh->hist || !dh->end_pos) + return -1; + } + for (n = 0; n < dh->nfiles; n++) { + int *hist = calloc(dh->size, sizeof(*dh->hist[n])); + if (!hist) + return -1; - // initialize the auxiliary data structures - if (file_list) - { - if (has_index_file) { - print_error("depth", "The -f option cannot be combined with -X"); - return 1; + // Simple approach for now; copy over old histogram verbatim. + for (i = dh->last_output; i < dh->last_output + old_size; i++) + hist[i & hmask] = dh->hist[n][i & old_hmask]; + free(dh->hist[n]); + dh->hist[n] = hist; } - if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; - n = nfiles; - argv = fn; - optind = 0; } - else if (has_index_file) { // Calculate # of input BAM files - if ((argc - optind) % 2 != 0) { - fprintf(stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n"); - return 1; - } - n = (argc - optind) / 2; + + // Accumulate depth, based on CIGAR + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar, j, k, spos = 0; + + // Zero new (previously unseen) coordinates so increment works later. 
+ hts_pos_t end = MAX(dh->end_pos[file], b->core.pos); + if (end_pos > end && (end & hmask) < (end_pos & hmask)) { + memset(&dh->hist[file][end & hmask], 0, + sizeof(**dh->hist) * (end_pos - end)); } else { - n = argc - optind; + for (i = end; i < end_pos; i++) + dh->hist[file][i & hmask] = 0; } - data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input - reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region - - for (i = 0; i < n; ++i) { - int rf; - data[i] = calloc(1, sizeof(aux_t)); - data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM - if (data[i]->fp == NULL) { - print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; - if (baseQ) rf |= SAM_QUAL; - if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); - status = EXIT_FAILURE; - goto depth_end; - } - if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); - status = EXIT_FAILURE; - goto depth_end; - } - data[i]->min_mapQ = mapQ; // set the mapQ filter - data[i]->min_len = min_len; // set the qlen filter - data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header - if (data[i]->hdr == NULL) { - print_error_errno("depth", "Couldn't read header for \"%s\"", - argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - if (reg) { // if a region is specified - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder - if (has_index_file) { - idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index + + i = b->core.pos; + uint8_t *qual = bam_get_qual(b); + int min_qual = opt->min_qual; + for (j = 0; j < ncig; j++) { + int op = bam_cigar_op(cig[j]); + int oplen = bam_cigar_oplen(cig[j]); + + switch (op) { + case BAM_CDEL: 
+ case BAM_CREF_SKIP: + if (op != BAM_CDEL || opt->skip_del) { + // don't increment reference location + if (i + oplen >= dh->end_pos[file]) { + for (k = 0; k < oplen; k++, i++) { + if (i >= dh->end_pos[file]) + // redundant due to zero new elements above? + dh->hist[file][i & hmask] = 0; + } + } else { + i += oplen; + } + } else { // op == BAM_CDEL and we count them (-J option), + // We don't incr spos here, but we still use qual. + // This doesn't make much sense, but it's for compatibility + // with the old code. Arguably DEL shouldn't have a min + // qual and should always pass (as we've explicitly asked to + // include them). + int *hist = dh->hist[file]; + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + break; + } else if (i < overlap_clip) { + k = overlap_clip - i; + i = overlap_clip; + } + } + + // Question: should we even check quality values for DEL? + // We've explicitly asked to include them, and the quality + // is wrong anyway (it's the neighbouring base). We do this + // for now for compatibility with the old depth command. + + if (spos < b->core.l_qseq) + for (; k < oplen; k++, i++) + hist[i & hmask]+=qual[spos]>=min_qual; + else + for (; k < oplen; k++, i++) + hist[i & hmask]++; + } + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + if ((i & hmask) < ((i+oplen) & hmask)) { + // Optimisation when not wrapping around + + // Unrolling doesn't help clang, but helps gcc, + // especially when not using -O3. 
+ int *hist = &dh->hist[file][i & hmask]; + if (min_qual || overlap_clip) { + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + spos += oplen; + break; + } else if (i < overlap_clip) { + oplen -= overlap_clip - i; + spos += overlap_clip - i; + hist += overlap_clip - i; + i = overlap_clip; + } + } + + // approx 50% of this func cpu time in this loop + for (; k < (oplen & ~7); k+=8) { + hist[k+0]+=qual[spos+0]>=min_qual; + hist[k+1]+=qual[spos+1]>=min_qual; + hist[k+2]+=qual[spos+2]>=min_qual; + hist[k+3]+=qual[spos+3]>=min_qual; + hist[k+4]+=qual[spos+4]>=min_qual; + hist[k+5]+=qual[spos+5]>=min_qual; + hist[k+6]+=qual[spos+6]>=min_qual; + hist[k+7]+=qual[spos+7]>=min_qual; + spos += 8; + } + } else { + // easier to vectorize when no min_qual + for (k = 0; k < (oplen & ~7); k+=8) { + hist[k+0]++; + hist[k+1]++; + hist[k+2]++; + hist[k+3]++; + hist[k+4]++; + hist[k+5]++; + hist[k+6]++; + hist[k+7]++; + } + spos += k; + } + for (; k < oplen && spos < b->core.l_qseq; k++, spos++) + hist[k]+=qual[spos]>=min_qual; + for (; k < oplen; k++, spos++) + hist[k]++; + i += oplen; } else { - idx = sam_index_load(data[i]->fp, argv[optind+i]); + // Simple to understand case, but slower. + // We use this only for reads with wrap-around. 
+ int *hist = dh->hist[file]; + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + break; + } else if (i < overlap_clip) { + oplen -= overlap_clip - i; + spos += overlap_clip - i; + i = overlap_clip; + } + } + for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++) + hist[i & hmask]+=qual[spos]>=min_qual; + for (; k < oplen; k++, i++, spos++) + hist[i & hmask]++; } - if (idx == NULL) { - print_error("depth", "can't load index for \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; + break; + + case BAM_CINS: + case BAM_CSOFT_CLIP: + spos += oplen; + break; + + case BAM_CPAD: + case BAM_CHARD_CLIP: + // ignore + break; + + default: + print_error("depth", "Unsupported cigar op '%d'", op); + return -1; + } + } + + if (dh->end >= 0 && end_pos > dh->end) + end_pos = dh->end; + if (dh->end_pos[file] < end_pos) + dh->end_pos[file] = end_pos; + + return 0; +} + +// Hash on name -> alignment end pos. This permits a naive overlap removal. +// Note it cannot analyse the overlapping sequence and qualities, so the +// interaction of basecalls/qualities and the -Q parameter cannot be +// applied here (unlike the full mpileup algorithm). +KHASH_MAP_INIT_STR(olap_hash, hts_pos_t) +typedef khash_t(olap_hash) olap_hash_t; + +static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, + samFile **fp, hts_itr_t **itr, sam_hdr_t **h) { + int ret = -1, err = 1, i; + olap_hash_t **overlaps = NULL; + depth_hist dh = {0}; + + // An array of bam structs, one per input file, to hold the next entry + bam1_t **b = calloc(nfiles, sizeof(*b)); + int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles; + if (!b || !finished) + goto err; + + for (i = 0; i < nfiles; i++) + if (!(b[i] = bam_init1())) + goto err; + + // Do we need one overlap hash per file? Or shared? 
+ if (opt->remove_overlaps) { + if (!(overlaps = calloc(nfiles, sizeof(*overlaps)))) + return -1; + for (i = 0; i < nfiles; i++) { + if (!(overlaps[i] = kh_init(olap_hash))) + return -1; + } + } + + // Create the initial histogram + dh.nfiles = nfiles; + dh.size = 0; + dh.hist = NULL; + dh.last_ref = -99; + dh.end_pos = NULL; + dh.last_output = itr && itr[0] ? itr[0]->beg : 0; + ks_initialize(&dh.ks); + + // Clip results to region if specified + dh.beg = -1; + dh.end = -1; + dh.tid = 0; + if (itr && itr[0]) { + dh.tid = itr[0]->tid; + dh.beg = itr[0]->beg; + dh.end = itr[0]->end; + } + + if (opt->header) { + fprintf(opt->out, "#CHROM\tPOS"); + for (i = 0; i < nfiles; i++) + fprintf(opt->out, "\t%s", fn[i]); + fputc('\n', opt->out); + } + + // Populate first record per file + for (i = 0; i < nfiles; i++) { + for(;;) { + ret = itr && itr[i] + ? sam_itr_next(fp[i], itr[i], b[i]) + : sam_read1(fp[i], h[i], b[i]); + if (ret < -1) + goto err; + if (ret == -1) { + to_go--; + finished[i] = 1; + break; } - data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator - hts_idx_destroy(idx); // the index is not needed any more; free the memory - if (data[i]->iter == NULL) { - print_error("depth", "can't parse region \"%s\"", reg); - status = EXIT_FAILURE; - goto depth_end; + + if (b[i]->core.tid < 0) + continue; + if (b[i]->core.flag & opt->flag) + continue; + if (b[i]->core.qual < opt->min_mqual) + continue; + + // Original samtools depth used the total sequence (l_qseq) + // including soft-clips. This doesn't feel like a useful metric + // to be filtering on. We now only count sequence bases that + // form the used part of the alignment. 
+ if (opt->min_len) { + if (qlen_used(b[i]) < opt->min_len) + continue; } + + break; } - data[i]->flags = flags; } - if (print_header) { - fputs("#CHROM\tPOS", file_out); - for (i = 0; i < n; ++i) { - fputc('\t', file_out); - fputs(argv[optind+i], file_out); + + // Loop through input files, merging in order so we're + // always adding the next record in sequence + while (to_go) { + // Find next record in file list + int best_tid = INT_MAX, best_file = 0; + hts_pos_t best_pos = HTS_POS_MAX; + + for (i = 0; i < nfiles; i++) { + if (finished[i]) + continue; + if (best_tid > b[i]->core.tid) { + best_tid = b[i]->core.tid; + best_pos = b[i]->core.pos; + best_file = i; + } else if (best_tid == b[i]->core.tid && + best_pos > b[i]->core.pos) { + best_pos = b[i]->core.pos; + best_file = i; } - fputc('\n', file_out); } - h = data[0]->hdr; // easy access to the header of the 1st BAM - if (reg) { - beg = data[0]->iter->beg; // and to the parsed region coordinates - end = data[0]->iter->end; - reg_tid = data[0]->iter->tid; - } + i = best_file; - // the core multi-pileup loop - mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization - if (0 < max_depth) - bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth - else if (!max_depth) - bam_mplp_set_maxcnt(mplp,INT_MAX); - n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) - while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (pos < beg || pos >= end) continue; // out of range; skip - if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? - if (all) { - while (tid > last_tid) { - if (last_tid >= 0 && !reg) { - // Deal with remainder or entirety of last tid. - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - // Horribly inefficient, but the bed API is an obfuscated black box. 
- if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, last_tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); - } + hts_pos_t clip = 0; + if (overlaps && (b[i]->core.flag & BAM_FPAIRED) && + !(b[i]->core.flag & BAM_FMUNMAP)) { + khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i])); + if (k == kh_end(overlaps[i])) { + // not seen before + hts_pos_t endpos = bam_endpos(b[i]); + + // Don't add if mate location is known and can't overlap. + if (b[i]->core.mpos == -1 || + (b[i]->core.tid == b[i]->core.mtid && + b[i]->core.mpos <= endpos)) { + k = kh_put(olap_hash, overlaps[i], bam_get_qname(b[i]), + &ret); + if (ret < 0) + return -1; + kh_key(overlaps[i], k) = strdup(bam_get_qname(b[i])); + kh_value(overlaps[i], k) = endpos; } - last_tid++; - last_pos = -1; - if (all < 2) - break; + } else { + // seen before + clip = kh_value(overlaps[i], k); + free((char *)kh_key(overlaps[i], k)); + kh_del(olap_hash, overlaps[i], k); } + } - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (last_pos < beg) continue; // out of range; skip - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); + // Add the next merged BAM record to the depth plot + if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) { + ret = -1; + goto err; + } + + // Populate next record from this file + for(;!finished[i];) { + ret = itr && itr[i] + ? 
sam_itr_next(fp[i], itr[i], b[i]) + : sam_read1(fp[i], h[i], b[i]); + if (ret < -1) { + ret = -1; + goto err; + } + if (ret == -1) { + to_go--; + finished[i] = 1; + break; } - last_tid = tid; - last_pos = pos; - } - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue; - fputs(sam_hdr_tid2name(h, tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized printf() would be faster - for (i = 0; i < n; ++i) { // base level filters have to go here - int j, m = 0; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know - if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos - else if (p->qpos < p->b->core.l_qseq && - bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality + if (b[i]->core.tid < 0) + continue; + if (b[i]->core.flag & opt->flag) + continue; + if (b[i]->core.qual < opt->min_mqual) + continue; + + if (opt->min_len) { + if (qlen_used(b[i]) < opt->min_len) + continue; } - fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output + + break; } - fputc('\n', file_out); } - if (ret < 0) status = EXIT_FAILURE; - free(n_plp); free(plp); - bam_mplp_destroy(mplp); - - if (all) { - // Handle terminating region - if (last_tid < 0 && reg) { - last_tid = reg_tid; - last_pos = beg-1; + + // Tidy up end. 
+ ret = add_depth(opt, &dh, h[0], NULL, 0, 0); + err = 0; + + err: + if (ret == 0 && err) + ret = -1; + + for (i = 0; i < nfiles; i++) { + if (b[i]) + bam_destroy1(b[i]); + if (dh.hist && dh.hist[i]) + free(dh.hist[i]); + } + free(b); + free(finished); + ks_free(&dh.ks); + free(dh.hist); + free(dh.end_pos); + if (overlaps) { + khiter_t k; + for (i = 0; i < nfiles; i++) { + if (!overlaps[i]) + continue; + for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++) + if (kh_exist(overlaps[i], k)) + free((char *)kh_key(overlaps[i], k)); + kh_destroy(olap_hash, overlaps[i]); } - while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end) break; - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, last_tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); + free(overlaps); + } + + return ret; +} + +static void usage_exit(FILE *fp, int exit_status) +{ + fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n"); + fprintf(fp, "\nOptions:\n"); + fprintf(fp, " -a Output all positions (including zero depth)\n"); + fprintf(fp, " -a -a, -aa Output absolutely all positions, including unused ref seqs\n"); + fprintf(fp, " -r REG Specify a region in chr or chr:from-to syntax\n"); + fprintf(fp, " -b FILE Use bed FILE for list of regions\n"); + fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n"); + fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n"); + fprintf(fp, " -g INT Remove specified flags from default flag filter\n"); + fprintf(fp, " -G INT Add specified flags to the default flag filter\n"); + fprintf(fp, " -H Print a file header line\n"); + fprintf(fp, " -l INT Minimum read length [0]\n"); + fprintf(fp, " -o FILE Write output to FILE [stdout]\n"); + fprintf(fp, " -q 
INT Minimum base quality [0]\n"); + fprintf(fp, " -Q INT Minimum mapping quality [0]\n"); + fprintf(fp, " -H Print a file header\n"); + fprintf(fp, " -J Include reads with deletions in depth computation\n"); + fprintf(fp, " -s Do not count overlapping reads within a template\n"); + sam_global_opt_help(fp, "-.---@-."); + exit(exit_status); +} + +int main_depth(int argc, char *argv[]) +{ + int nfiles, i; + samFile **fp; + sam_hdr_t **header; + int c, has_index_file = 0; + char *file_list = NULL, **fn = NULL; + depth_opt opt = { + .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, + .min_qual = 0, + .min_mqual = 0, + .skip_del = 1, + .header = 0, + .min_len = 0, + .out = stdout, + .all_pos = 0, + .remove_overlaps = 0, + .reg = NULL, + .bed = NULL, + }; + + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s", + lopts, NULL)) >= 0) { + switch (c) { + case 'a': + opt.all_pos++; + break; + + case 'b': + opt.bed = bed_read(optarg); + if (!opt.bed) { + print_error_errno("depth", "Could not read file \"%s\"", + optarg); + return 1; } - last_tid++; - last_pos = -1; - if (all < 2 || reg) + break; + + case 'f': + file_list = optarg; + break; + + case 'd': + case 'm': + // depth limit - now ignored + break; + + case 'g': + opt.flag &= ~bam_str2flag(optarg); + break; + case 'G': + opt.flag |= bam_str2flag(optarg); + break; + + case 'l': + opt.min_len = atoi(optarg); + break; + + case 'H': + opt.header = 1; + break; + + case 'q': + opt.min_qual = atoi(optarg); + break; + case 'Q': + opt.min_mqual = atoi(optarg); + break; + + case 'J': + opt.skip_del = 0; + break; + + case 'o': + if (opt.out != stdout) break; + opt.out = fopen(optarg, "w"); + if (!opt.out) { + print_error_errno("depth", "Cannot open \"%s\" for writing.", + optarg); + return EXIT_FAILURE; + } + break; + + case 'r': + opt.reg 
= optarg; + break; + + case 's': + opt.remove_overlaps = 1; + break; + + case 'X': + has_index_file = 1; + break; + + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + usage_exit(stderr, EXIT_FAILURE); } } -depth_end: - if (((file_out != stdout)? fclose(file_out) : fflush(file_out)) != 0) { - if (status == EXIT_SUCCESS) { - if (file_out != stdout) - print_error_errno("depth", "error on closing \"%s\"", output_file); - else - print_error_errno("depth", "error on flushing standard output"); - status = EXIT_FAILURE; + if (argc < optind+1 && !file_list) { + if (argc == optind) + usage_exit(stdout, EXIT_SUCCESS); + else + usage_exit(stderr, EXIT_FAILURE); + } + + if (file_list) { + if (has_index_file) { + print_error("depth", "The -f option cannot be combined with -X"); + return 1; + } + if (read_file_list(file_list, &nfiles, &fn)) + return 1; + argv = fn; + argc = nfiles; + optind = 0; + } else { + nfiles = argc - optind; + } + + if (has_index_file) { + if (nfiles%1) { + print_error("depth", "-X needs one index specified per bam file"); + return 1; } + nfiles /= 2; + } + fp = malloc(nfiles * sizeof(*fp)); + header = malloc(nfiles * sizeof(*header)); + if (!fp || !header) { + print_error_errno("depth", "Out of memory"); + return 1; } - for (i = 0; i < n && data[i]; ++i) { - sam_hdr_destroy(data[i]->hdr); - if (data[i]->fp) sam_close(data[i]->fp); - hts_itr_destroy(data[i]->iter); - free(data[i]); + hts_itr_t **itr = NULL; + if (opt.reg) { + itr = calloc(nfiles, sizeof(*itr)); + if (!itr) + return 1; } - free(data); free(reg); - if (bed) bed_destroy(bed); - if ( file_list ) - { - for (i=0; i 0) + hts_set_threads(fp[i], ga.nthreads); + + if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR + | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT + : 0) + | (opt.min_mqual ? SAM_MAPQ : 0) + | (opt.min_len ? SAM_SEQ : 0) + | (opt.min_qual ? 
SAM_QUAL : 0))) { + fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + + if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } + + // FIXME: what if headers differ? + header[i] = sam_hdr_read(fp[i]); + if (header == NULL) { + fprintf(stderr, "Failed to read header for \"%s\"\n", + argv[optind]); + return 1; + } + + if (opt.reg) { + hts_idx_t *idx = has_index_file + ? sam_index_load2(fp[i], argv[optind], argv[optind+nfiles]) + : sam_index_load(fp[i], argv[optind]); + if (!idx) { + print_error("depth", "cannot load index for \"%s\"", + argv[optind]); + return 1; + } + if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) { + print_error("depth", "cannot parse region \"%s\"", opt.reg); + return 1; + } + hts_idx_destroy(idx); + } + } + + int ret = fastdepth_core(&opt, nfiles, &argv[argc-nfiles], fp, itr, header) + ? 1 : 0; + + for (i = 0; i < nfiles; i++) { + sam_hdr_destroy(header[i]); + sam_close(fp[i]); + if (itr && itr[i]) + hts_itr_destroy(itr[i]); + } + free(header); + free(fp); + free(itr); + if (file_list) { + for (i=0; i (to 2020) + Author: James Bonfield (2021 rewrite) - Author: Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,7 +28,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* This program demonstrates how to generate pileup from multiple BAMs - * simutaneously, to achieve random access and to use the BED interface. + * simultaneously, to achieve random access and to use the BED interface. * To compile this program separately, you may: * * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz @@ -43,355 +45,913 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" +#include "htslib/khash.h" -#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1) +// From bam_plcmd.c +int read_file_list(const char *file_list, int *n, char **argv[]); -typedef struct { // auxiliary data structure - samFile *fp; // the file handle - sam_hdr_t *hdr; // the file header - hts_itr_t *iter; // NULL if a region not specified - int min_mapQ, min_len; // mapQ filter; length filter - uint32_t flags; // read filtering flags -} aux_t; +// We accumulate to hist[pos & (size-1)]. This is a ring-buffer. +// We track where we last got to in output and what the biggest value +// we've written to so far (in absolute unmasked coordinates) in +// "last_output" and "end_pos" respectively. +// For each new record we just flush anything we haven't written yet +// already, between "last_output" and this read's start position, and +// initialise any newly seen positions between "end_pos" and this read's +// end position. +typedef struct { + size_t size; + int **hist; // hist[nfiles][size] + hts_pos_t *end_pos; // end_pos[nfiles] + hts_pos_t last_output; + int last_ref; + int nfiles; + const char *ref; + kstring_t ks; + hts_pos_t beg, end; // limit to region + int tid; +} depth_hist; -// This function reads a BAM alignment from one BAM file. -static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup -{ - aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - int ret; - while (1) - { - ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); - if ( ret<0 ) break; - if ( b->core.flag & aux->flags) continue; - if ( (int)b->core.qual < aux->min_mapQ ) continue; - if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - break; +typedef struct { + int header; + int flag; + int min_qual; + int min_mqual; + int min_len; + int skip_del; + int all_pos; + int remove_overlaps; + FILE *out; + char *reg; + void *bed; +} depth_opt; + +static void zero_region(depth_opt *opt, depth_hist *dh, + const char *name, hts_pos_t start, hts_pos_t end) { + hts_pos_t i; + kstring_t *ks = &dh->ks; + + kputs(name, ks_clear(ks)); + kputc('\t', ks); + size_t cur_l = ks->l; + if (dh->beg >= 0 && start < dh->beg) + start = dh->beg; + if (dh->end >= 0 && end > dh->end) + end = dh->end; + + for (i = start; i < end; i++) { + // Could be optimised, but needs better API to skip to next + // bed region. + if (opt->bed && bed_overlap(opt->bed, name, i, i+1) == 0) + continue; + + ks->l = cur_l; + kputll(i+1, ks); + int n; + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', ks); + kputc_('0', ks); + } + kputc('\n', ks); + fputs(ks->s, opt->out); } - return ret; + ks->l = cur_l; } -int read_file_list(const char *file_list,int *n,char **argv[]); - -static int usage() { - fprintf(samtools_stderr, "\n"); - fprintf(samtools_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); - fprintf(samtools_stderr, "Options:\n"); - fprintf(samtools_stderr, " -a output all positions (including zero depth)\n"); - fprintf(samtools_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n"); - fprintf(samtools_stderr, " -b list of positions or regions\n"); - fprintf(samtools_stderr, " -X use customized index files\n"); - fprintf(samtools_stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(samtools_stderr, " -H print a file header\n"); - fprintf(samtools_stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); - fprintf(samtools_stderr, " -d/-m maximum coverage depth [8000]. If 0, depth is set to the maximum\n" - " integer value, effectively removing any depth limit.\n"); // the htslib's default - fprintf(samtools_stderr, " -o FILE where to write output to [samtools_stdout]\n"); - fprintf(samtools_stderr, " -q base quality threshold [0]\n"); - fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); - fprintf(samtools_stderr, " -r region\n"); - fprintf(samtools_stderr, " -g include reads that have any of the specified flags set [0]\n"); - fprintf(samtools_stderr, " -G filter out reads that have any of the specified flags set" - " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); - - sam_global_opt_help(samtools_stderr, "-.--.--."); - - fprintf(samtools_stderr, "\n"); - fprintf(samtools_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); - fprintf(samtools_stderr, "position, and coverage depth. Note that positions with zero coverage may be\n"); - fprintf(samtools_stderr, "omitted by default; see the -a option.\n"); - fprintf(samtools_stderr, "\n"); - - return EXIT_FAILURE; +// A variation of bam_cigar2qlen which doesn't count soft-clips in to the +// equation. Basically it's the number of bases in query that are aligned +// in some way to the reference (including insertions, which are considered +// to be aligned by dint of being anchored either side). +hts_pos_t qlen_used(bam1_t *b) { + int n_cigar = b->core.n_cigar; + const uint32_t *cigar = bam_get_cigar(b); + + hts_pos_t l; + + if (b->core.l_qseq) { + // Known SEQ permits of short cut of l_qseq minus CSOFT_CLIPs. 
+ // Full scan not needed, which helps on excessively long CIGARs. + l = b->core.l_qseq; + int kl, kr; + for (kl = 0; kl < n_cigar; kl++) + if (bam_cigar_op(cigar[kl]) == BAM_CSOFT_CLIP) + l -= bam_cigar_oplen(cigar[kl]); + else + break; + + for (kr = n_cigar-1; kr > kl; kr--) + if (bam_cigar_op(cigar[kr]) == BAM_CSOFT_CLIP) + l -= bam_cigar_oplen(cigar[kr]); + else + break; + } else { + // Unknown SEQ ("*") needs a full scan through the CIGAR string. + static int query[16] = { + //M I D N S H P = X B ? ? ? ? ? ? + 1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0 + }; + int k; + for (k = l = 0; k < n_cigar; k++) + if (query[bam_cigar_op(cigar[k])]) + l += bam_cigar_oplen(cigar[k]); + } + return l; + } -int main_depth(int argc, char *argv[]) -{ - int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; - hts_pos_t beg, end, pos, last_pos = -1; - int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; - const bam_pileup1_t **plp; - char *reg = 0; // specified region - void *bed = 0; // BED data structure - char *file_list = NULL, **fn = NULL; - sam_hdr_t *h = NULL; // BAM header of the 1st input - aux_t **data; - bam_mplp_t mplp; - int last_tid = -1, ret; - int print_header = 0; - char *output_file = NULL; - FILE *file_out = samtools_stdout; - uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); - int tflags = 0; +// Adds the depth for a single read to a depth_hist struct. +// For just one file, this is easy. We just have a circular buffer +// where we increment values for bits that overlap existing data +// and initialise values for coordinates which we're seeing for the first +// time. This is tracked by "end_pos" to know where we've got to. +// +// As the input is sorted, we can flush output from "last_output" to +// b->core.pos. +// +// With multiple files, we must feed data in sorted order as if all files +// are merged, but track depth per file. 
This also means "end_pos" is per +// file too, but "last_output" is global as it corresponds to rows printed. +static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, + int overlap_clip, int file) { + hts_pos_t i; + size_t hmask = dh->size-1; + int n; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), - { NULL, 0, NULL, 0 } - }; + if (!b || b->core.tid != dh->last_ref) { + // New ref + if (dh->last_ref >= 0) { + // do end + size_t cur_l = dh->ks.l; + int nf = dh->nfiles; + i = dh->last_output; + for (i = dh->last_output; nf; i++) { + nf = 0; + for (n = 0; n < dh->nfiles; n++) { + if (i < dh->end_pos[n]) + nf++; + } + if (!nf) + break; + + if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0) + continue; - // parse the command line - while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) { - switch (n) { - case 'l': min_len = atoi(optarg); break; // minimum query length - case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header - case 'b': - bed = bed_read(optarg); // BED or position list file can be parsed now - if (!bed) { - print_error_errno("depth", "Could not read file \"%s\"", optarg); - return EXIT_FAILURE; + dh->ks.l = cur_l; + kputll(i+1, &dh->ks); + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', &dh->ks); + int d = i < dh->end_pos[n] + ? 
dh->hist[n][i & hmask] + : 0; + kputuw(d, &dh->ks); } - break; - case 'X': has_index_file = 1; break; - case 'q': baseQ = atoi(optarg); break; // base quality threshold - case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold - case 'f': file_list = optarg; break; - case 'a': all++; break; - case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth - case 'H': print_header = 1; break; - case 'o': output_file = optarg; break; - case 'g': - tflags = bam_str2flag(optarg); - if (tflags < 0 || tflags > BAM_FMAX) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); - return 1; + kputc('\n', &dh->ks); + fputs(dh->ks.s, opt->out); + } + if (opt->all_pos) { + // End of last ref + zero_region(opt, dh, + sam_hdr_tid2name(h, dh->last_ref), + i, sam_hdr_tid2len(h, dh->last_ref)); + } + dh->ks.l = cur_l; + } + + if (opt->all_pos > 1 && !opt->reg) { + // Any previous unused refs + int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1; + int rr = b ? b->core.tid : sam_hdr_nref(h), r; + for (r = lr; r < rr; r++) + zero_region(opt, dh, + sam_hdr_tid2name(h, r), + 0, sam_hdr_tid2len(h, r)); + } + + if (!b) { + // we're just flushing to end of file + if (opt->all_pos && opt->reg && dh->last_ref < 0) + // -a or -aa without a single read being output yet + zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg, + MIN(dh->end, sam_hdr_tid2len(h, dh->tid))); + + return 0; + } + + for (n = 0; dh->end_pos && n < dh->nfiles; n++) + dh->end_pos[n] = 0; + dh->last_output = dh->beg >= 0 + ? 
MAX(b->core.pos, dh->beg) + : b->core.pos; + dh->last_ref = b->core.tid; + dh->ref = sam_hdr_tid2name(h, b->core.tid); + kputs(dh->ref, ks_clear(&dh->ks)); + kputc('\t', &dh->ks); + + if (opt->all_pos) + // Start of ref + zero_region(opt, dh, dh->ref, 0, b->core.pos); + } else { + if (dh->last_output < b->core.pos) { + // Flush any depth outputs up to start of new read + size_t cur_l = dh->ks.l; + int nf = dh->nfiles; + for (i = dh->last_output; i < b->core.pos; i++) { + nf = 0; + for (n = 0; n < dh->nfiles; n++) { + if (i < dh->end_pos[n]) + nf++; } - flags &= ~tflags; - break; - case 'G': - tflags = bam_str2flag(optarg); - if (tflags < 0 || tflags > BAM_FMAX) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); - return 1; + if (!nf) + break; + + if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0) + continue; + + dh->ks.l = cur_l; + kputll(i+1, &dh->ks); + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', &dh->ks); + int d = i < dh->end_pos[n] + ? dh->hist[n][i & hmask] + : 0; + kputuw(d, &dh->ks); } - flags |= tflags; - break; - default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(); + kputc('\n', &dh->ks); + fputs(dh->ks.s, opt->out); + } + if (opt->all_pos && i < b->core.pos) + // Hole in middle of ref + zero_region(opt, dh, dh->ref, i, b->core.pos); + + dh->ks.l = cur_l; + dh->last_output = b->core.pos; } } - if (optind == argc && !file_list) - return usage(); - - /* output file provided by user */ - if (output_file != NULL && strcmp(output_file,"-")!=0) { - file_out = fopen( output_file, "w" ); - if (file_out == NULL) { - print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); - return EXIT_FAILURE; - } + + hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end. 
+ //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos); + + if (b->core.tid < dh->last_ref || + (dh->last_ref == b->core.tid && end_pos < dh->last_output)) { + print_error_errno("depth", "Data is not position sorted"); + return -1; } + // If needed, grow the circular buffer. + if (end_pos+1 - b->core.pos >= dh->size) { + size_t old_size = dh->size; + size_t old_hmask = hmask; + while (end_pos+1 - b->core.pos >= dh->size) + dh->size = dh->size ? 2*dh->size : 2048; + hmask = dh->size-1; + if (!dh->hist) { + dh->hist = calloc(dh->nfiles, sizeof(*dh->hist)); + dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos)); + if (!dh->hist || !dh->end_pos) + return -1; + } + for (n = 0; n < dh->nfiles; n++) { + int *hist = calloc(dh->size, sizeof(*dh->hist[n])); + if (!hist) + return -1; - // initialize the auxiliary data structures - if (file_list) - { - if (has_index_file) { - print_error("depth", "The -f option cannot be combined with -X"); - return 1; + // Simple approach for now; copy over old histogram verbatim. + for (i = dh->last_output; i < dh->last_output + old_size; i++) + hist[i & hmask] = dh->hist[n][i & old_hmask]; + free(dh->hist[n]); + dh->hist[n] = hist; } - if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; - n = nfiles; - argv = fn; - optind = 0; } - else if (has_index_file) { // Calculate # of input BAM files - if ((argc - optind) % 2 != 0) { - fprintf(samtools_stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n"); - return 1; - } - n = (argc - optind) / 2; + + // Accumulate depth, based on CIGAR + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar, j, k, spos = 0; + + // Zero new (previously unseen) coordinates so increment works later. 
+ hts_pos_t end = MAX(dh->end_pos[file], b->core.pos); + if (end_pos > end && (end & hmask) < (end_pos & hmask)) { + memset(&dh->hist[file][end & hmask], 0, + sizeof(**dh->hist) * (end_pos - end)); } else { - n = argc - optind; + for (i = end; i < end_pos; i++) + dh->hist[file][i & hmask] = 0; } - data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input - reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region - - for (i = 0; i < n; ++i) { - int rf; - data[i] = calloc(1, sizeof(aux_t)); - data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM - if (data[i]->fp == NULL) { - print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; - if (baseQ) rf |= SAM_QUAL; - if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); - status = EXIT_FAILURE; - goto depth_end; - } - if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); - status = EXIT_FAILURE; - goto depth_end; - } - data[i]->min_mapQ = mapQ; // set the mapQ filter - data[i]->min_len = min_len; // set the qlen filter - data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header - if (data[i]->hdr == NULL) { - print_error_errno("depth", "Couldn't read header for \"%s\"", - argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - if (reg) { // if a region is specified - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder - if (has_index_file) { - idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index + + i = b->core.pos; + uint8_t *qual = bam_get_qual(b); + int min_qual = opt->min_qual; + for (j = 0; j < ncig; j++) { + int op = bam_cigar_op(cig[j]); + int oplen = bam_cigar_oplen(cig[j]); + + switch (op) { + case BAM_CDEL: 
+ case BAM_CREF_SKIP: + if (op != BAM_CDEL || opt->skip_del) { + // don't increment reference location + if (i + oplen >= dh->end_pos[file]) { + for (k = 0; k < oplen; k++, i++) { + if (i >= dh->end_pos[file]) + // redundant due to zero new elements above? + dh->hist[file][i & hmask] = 0; + } + } else { + i += oplen; + } + } else { // op == BAM_CDEL and we count them (-J option), + // We don't incr spos here, but we still use qual. + // This doesn't make much sense, but it's for compatibility + // with the old code. Arguably DEL shouldn't have a min + // qual and should always pass (as we've explicitly asked to + // include them). + int *hist = dh->hist[file]; + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + break; + } else if (i < overlap_clip) { + k = overlap_clip - i; + i = overlap_clip; + } + } + + // Question: should we even check quality values for DEL? + // We've explicitly asked to include them, and the quality + // is wrong anyway (it's the neighbouring base). We do this + // for now for compatibility with the old depth command. + + if (spos < b->core.l_qseq) + for (; k < oplen; k++, i++) + hist[i & hmask]+=qual[spos]>=min_qual; + else + for (; k < oplen; k++, i++) + hist[i & hmask]++; + } + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + if ((i & hmask) < ((i+oplen) & hmask)) { + // Optimisation when not wrapping around + + // Unrolling doesn't help clang, but helps gcc, + // especially when not using -O3. 
+ int *hist = &dh->hist[file][i & hmask]; + if (min_qual || overlap_clip) { + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + spos += oplen; + break; + } else if (i < overlap_clip) { + oplen -= overlap_clip - i; + spos += overlap_clip - i; + hist += overlap_clip - i; + i = overlap_clip; + } + } + + // approx 50% of this func cpu time in this loop + for (; k < (oplen & ~7); k+=8) { + hist[k+0]+=qual[spos+0]>=min_qual; + hist[k+1]+=qual[spos+1]>=min_qual; + hist[k+2]+=qual[spos+2]>=min_qual; + hist[k+3]+=qual[spos+3]>=min_qual; + hist[k+4]+=qual[spos+4]>=min_qual; + hist[k+5]+=qual[spos+5]>=min_qual; + hist[k+6]+=qual[spos+6]>=min_qual; + hist[k+7]+=qual[spos+7]>=min_qual; + spos += 8; + } + } else { + // easier to vectorize when no min_qual + for (k = 0; k < (oplen & ~7); k+=8) { + hist[k+0]++; + hist[k+1]++; + hist[k+2]++; + hist[k+3]++; + hist[k+4]++; + hist[k+5]++; + hist[k+6]++; + hist[k+7]++; + } + spos += k; + } + for (; k < oplen && spos < b->core.l_qseq; k++, spos++) + hist[k]+=qual[spos]>=min_qual; + for (; k < oplen; k++, spos++) + hist[k]++; + i += oplen; } else { - idx = sam_index_load(data[i]->fp, argv[optind+i]); + // Simple to understand case, but slower. + // We use this only for reads with wrap-around. 
+ int *hist = dh->hist[file]; + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + break; + } else if (i < overlap_clip) { + oplen -= overlap_clip - i; + spos += overlap_clip - i; + i = overlap_clip; + } + } + for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++) + hist[i & hmask]+=qual[spos]>=min_qual; + for (; k < oplen; k++, i++, spos++) + hist[i & hmask]++; } - if (idx == NULL) { - print_error("depth", "can't load index for \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; + break; + + case BAM_CINS: + case BAM_CSOFT_CLIP: + spos += oplen; + break; + + case BAM_CPAD: + case BAM_CHARD_CLIP: + // ignore + break; + + default: + print_error("depth", "Unsupported cigar op '%d'", op); + return -1; + } + } + + if (dh->end >= 0 && end_pos > dh->end) + end_pos = dh->end; + if (dh->end_pos[file] < end_pos) + dh->end_pos[file] = end_pos; + + return 0; +} + +// Hash on name -> alignment end pos. This permits a naive overlap removal. +// Note it cannot analyse the overlapping sequence and qualities, so the +// interaction of basecalls/qualities and the -Q parameter cannot be +// applied here (unlike the full mpileup algorithm). +KHASH_MAP_INIT_STR(olap_hash, hts_pos_t) +typedef khash_t(olap_hash) olap_hash_t; + +static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, + samFile **fp, hts_itr_t **itr, sam_hdr_t **h) { + int ret = -1, err = 1, i; + olap_hash_t **overlaps = NULL; + depth_hist dh = {0}; + + // An array of bam structs, one per input file, to hold the next entry + bam1_t **b = calloc(nfiles, sizeof(*b)); + int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles; + if (!b || !finished) + goto err; + + for (i = 0; i < nfiles; i++) + if (!(b[i] = bam_init1())) + goto err; + + // Do we need one overlap hash per file? Or shared? 
+ if (opt->remove_overlaps) { + if (!(overlaps = calloc(nfiles, sizeof(*overlaps)))) + return -1; + for (i = 0; i < nfiles; i++) { + if (!(overlaps[i] = kh_init(olap_hash))) + return -1; + } + } + + // Create the initial histogram + dh.nfiles = nfiles; + dh.size = 0; + dh.hist = NULL; + dh.last_ref = -99; + dh.end_pos = NULL; + dh.last_output = itr && itr[0] ? itr[0]->beg : 0; + ks_initialize(&dh.ks); + + // Clip results to region if specified + dh.beg = -1; + dh.end = -1; + dh.tid = 0; + if (itr && itr[0]) { + dh.tid = itr[0]->tid; + dh.beg = itr[0]->beg; + dh.end = itr[0]->end; + } + + if (opt->header) { + fprintf(opt->out, "#CHROM\tPOS"); + for (i = 0; i < nfiles; i++) + fprintf(opt->out, "\t%s", fn[i]); + fputc('\n', opt->out); + } + + // Populate first record per file + for (i = 0; i < nfiles; i++) { + for(;;) { + ret = itr && itr[i] + ? sam_itr_next(fp[i], itr[i], b[i]) + : sam_read1(fp[i], h[i], b[i]); + if (ret < -1) + goto err; + if (ret == -1) { + to_go--; + finished[i] = 1; + break; } - data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator - hts_idx_destroy(idx); // the index is not needed any more; free the memory - if (data[i]->iter == NULL) { - print_error("depth", "can't parse region \"%s\"", reg); - status = EXIT_FAILURE; - goto depth_end; + + if (b[i]->core.tid < 0) + continue; + if (b[i]->core.flag & opt->flag) + continue; + if (b[i]->core.qual < opt->min_mqual) + continue; + + // Original samtools depth used the total sequence (l_qseq) + // including soft-clips. This doesn't feel like a useful metric + // to be filtering on. We now only count sequence bases that + // form the used part of the alignment. 
+ if (opt->min_len) { + if (qlen_used(b[i]) < opt->min_len) + continue; } + + break; } - data[i]->flags = flags; } - if (print_header) { - fputs("#CHROM\tPOS", file_out); - for (i = 0; i < n; ++i) { - fputc('\t', file_out); - fputs(argv[optind+i], file_out); + + // Loop through input files, merging in order so we're + // always adding the next record in sequence + while (to_go) { + // Find next record in file list + int best_tid = INT_MAX, best_file = 0; + hts_pos_t best_pos = HTS_POS_MAX; + + for (i = 0; i < nfiles; i++) { + if (finished[i]) + continue; + if (best_tid > b[i]->core.tid) { + best_tid = b[i]->core.tid; + best_pos = b[i]->core.pos; + best_file = i; + } else if (best_tid == b[i]->core.tid && + best_pos > b[i]->core.pos) { + best_pos = b[i]->core.pos; + best_file = i; } - fputc('\n', file_out); } - h = data[0]->hdr; // easy access to the header of the 1st BAM - if (reg) { - beg = data[0]->iter->beg; // and to the parsed region coordinates - end = data[0]->iter->end; - reg_tid = data[0]->iter->tid; - } + i = best_file; - // the core multi-pileup loop - mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization - if (0 < max_depth) - bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth - else if (!max_depth) - bam_mplp_set_maxcnt(mplp,INT_MAX); - n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) - while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (pos < beg || pos >= end) continue; // out of range; skip - if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? - if (all) { - while (tid > last_tid) { - if (last_tid >= 0 && !reg) { - // Deal with remainder or entirety of last tid. - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - // Horribly inefficient, but the bed API is an obfuscated black box. 
- if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, last_tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); - } + hts_pos_t clip = 0; + if (overlaps && (b[i]->core.flag & BAM_FPAIRED) && + !(b[i]->core.flag & BAM_FMUNMAP)) { + khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i])); + if (k == kh_end(overlaps[i])) { + // not seen before + hts_pos_t endpos = bam_endpos(b[i]); + + // Don't add if mate location is known and can't overlap. + if (b[i]->core.mpos == -1 || + (b[i]->core.tid == b[i]->core.mtid && + b[i]->core.mpos <= endpos)) { + k = kh_put(olap_hash, overlaps[i], bam_get_qname(b[i]), + &ret); + if (ret < 0) + return -1; + kh_key(overlaps[i], k) = strdup(bam_get_qname(b[i])); + kh_value(overlaps[i], k) = endpos; } - last_tid++; - last_pos = -1; - if (all < 2) - break; + } else { + // seen before + clip = kh_value(overlaps[i], k); + free((char *)kh_key(overlaps[i], k)); + kh_del(olap_hash, overlaps[i], k); } + } - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (last_pos < beg) continue; // out of range; skip - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); + // Add the next merged BAM record to the depth plot + if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) { + ret = -1; + goto err; + } + + // Populate next record from this file + for(;!finished[i];) { + ret = itr && itr[i] + ? 
sam_itr_next(fp[i], itr[i], b[i]) + : sam_read1(fp[i], h[i], b[i]); + if (ret < -1) { + ret = -1; + goto err; + } + if (ret == -1) { + to_go--; + finished[i] = 1; + break; } - last_tid = tid; - last_pos = pos; - } - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue; - fputs(sam_hdr_tid2name(h, tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized fprintf(samtools_stdout, ) would be faster - for (i = 0; i < n; ++i) { // base level filters have to go here - int j, m = 0; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know - if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos - else if (p->qpos < p->b->core.l_qseq && - bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality + if (b[i]->core.tid < 0) + continue; + if (b[i]->core.flag & opt->flag) + continue; + if (b[i]->core.qual < opt->min_mqual) + continue; + + if (opt->min_len) { + if (qlen_used(b[i]) < opt->min_len) + continue; } - fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output + + break; } - fputc('\n', file_out); } - if (ret < 0) status = EXIT_FAILURE; - free(n_plp); free(plp); - bam_mplp_destroy(mplp); - - if (all) { - // Handle terminating region - if (last_tid < 0 && reg) { - last_tid = reg_tid; - last_pos = beg-1; + + // Tidy up end. 
+ ret = add_depth(opt, &dh, h[0], NULL, 0, 0); + err = 0; + + err: + if (ret == 0 && err) + ret = -1; + + for (i = 0; i < nfiles; i++) { + if (b[i]) + bam_destroy1(b[i]); + if (dh.hist && dh.hist[i]) + free(dh.hist[i]); + } + free(b); + free(finished); + ks_free(&dh.ks); + free(dh.hist); + free(dh.end_pos); + if (overlaps) { + khiter_t k; + for (i = 0; i < nfiles; i++) { + if (!overlaps[i]) + continue; + for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++) + if (kh_exist(overlaps[i], k)) + free((char *)kh_key(overlaps[i], k)); + kh_destroy(olap_hash, overlaps[i]); } - while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end) break; - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, last_tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); + free(overlaps); + } + + return ret; +} + +static void usage_exit(FILE *fp, int exit_status) +{ + fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n"); + fprintf(fp, "\nOptions:\n"); + fprintf(fp, " -a Output all positions (including zero depth)\n"); + fprintf(fp, " -a -a, -aa Output absolutely all positions, including unused ref seqs\n"); + fprintf(fp, " -r REG Specify a region in chr or chr:from-to syntax\n"); + fprintf(fp, " -b FILE Use bed FILE for list of regions\n"); + fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n"); + fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n"); + fprintf(fp, " -g INT Remove specified flags from default flag filter\n"); + fprintf(fp, " -G INT Add specified flags to the default flag filter\n"); + fprintf(fp, " -H Print a file header line\n"); + fprintf(fp, " -l INT Minimum read length [0]\n"); + fprintf(fp, " -o FILE Write output to FILE [samtools_stdout]\n"); + 
fprintf(fp, " -q INT Minimum base quality [0]\n"); + fprintf(fp, " -Q INT Minimum mapping quality [0]\n"); + fprintf(fp, " -H Print a file header\n"); + fprintf(fp, " -J Include reads with deletions in depth computation\n"); + fprintf(fp, " -s Do not count overlapping reads within a template\n"); + sam_global_opt_help(fp, "-.---@-."); + samtools_exit(exit_status); +} + +int main_depth(int argc, char *argv[]) +{ + int nfiles, i; + samFile **fp; + sam_hdr_t **header; + int c, has_index_file = 0; + char *file_list = NULL, **fn = NULL; + depth_opt opt = { + .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, + .min_qual = 0, + .min_mqual = 0, + .skip_del = 1, + .header = 0, + .min_len = 0, + .out = samtools_stdout, + .all_pos = 0, + .remove_overlaps = 0, + .reg = NULL, + .bed = NULL, + }; + + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s", + lopts, NULL)) >= 0) { + switch (c) { + case 'a': + opt.all_pos++; + break; + + case 'b': + opt.bed = bed_read(optarg); + if (!opt.bed) { + print_error_errno("depth", "Could not read file \"%s\"", + optarg); + return 1; } - last_tid++; - last_pos = -1; - if (all < 2 || reg) + break; + + case 'f': + file_list = optarg; + break; + + case 'd': + case 'm': + // depth limit - now ignored + break; + + case 'g': + opt.flag &= ~bam_str2flag(optarg); + break; + case 'G': + opt.flag |= bam_str2flag(optarg); + break; + + case 'l': + opt.min_len = atoi(optarg); + break; + + case 'H': + opt.header = 1; + break; + + case 'q': + opt.min_qual = atoi(optarg); + break; + case 'Q': + opt.min_mqual = atoi(optarg); + break; + + case 'J': + opt.skip_del = 0; + break; + + case 'o': + if (opt.out != samtools_stdout) break; + opt.out = fopen(optarg, "w"); + if (!opt.out) { + print_error_errno("depth", "Cannot open \"%s\" for writing.", + optarg); + return 
EXIT_FAILURE; + } + break; + + case 'r': + opt.reg = optarg; + break; + + case 's': + opt.remove_overlaps = 1; + break; + + case 'X': + has_index_file = 1; + break; + + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + usage_exit(samtools_stderr, EXIT_FAILURE); } } -depth_end: - if (((file_out != samtools_stdout)? fclose(file_out) : fflush(file_out)) != 0) { - if (status == EXIT_SUCCESS) { - if (file_out != samtools_stdout) - print_error_errno("depth", "error on closing \"%s\"", output_file); - else - print_error_errno("depth", "error on flushing standard output"); - status = EXIT_FAILURE; + if (argc < optind+1 && !file_list) { + if (argc == optind) + usage_exit(samtools_stdout, EXIT_SUCCESS); + else + usage_exit(samtools_stderr, EXIT_FAILURE); + } + + if (file_list) { + if (has_index_file) { + print_error("depth", "The -f option cannot be combined with -X"); + return 1; + } + if (read_file_list(file_list, &nfiles, &fn)) + return 1; + argv = fn; + argc = nfiles; + optind = 0; + } else { + nfiles = argc - optind; + } + + if (has_index_file) { + if (nfiles%1) { + print_error("depth", "-X needs one index specified per bam file"); + return 1; } + nfiles /= 2; + } + fp = malloc(nfiles * sizeof(*fp)); + header = malloc(nfiles * sizeof(*header)); + if (!fp || !header) { + print_error_errno("depth", "Out of memory"); + return 1; } - for (i = 0; i < n && data[i]; ++i) { - sam_hdr_destroy(data[i]->hdr); - if (data[i]->fp) sam_close(data[i]->fp); - hts_itr_destroy(data[i]->iter); - free(data[i]); + hts_itr_t **itr = NULL; + if (opt.reg) { + itr = calloc(nfiles, sizeof(*itr)); + if (!itr) + return 1; } - free(data); free(reg); - if (bed) bed_destroy(bed); - if ( file_list ) - { - for (i=0; i 0) + hts_set_threads(fp[i], ga.nthreads); + + if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR + | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT + : 0) + | (opt.min_mqual ? 
SAM_MAPQ : 0) + | (opt.min_len ? SAM_SEQ : 0) + | (opt.min_qual ? SAM_QUAL : 0))) { + fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + + if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) { + fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } + + // FIXME: what if headers differ? + header[i] = sam_hdr_read(fp[i]); + if (header == NULL) { + fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", + argv[optind]); + return 1; + } + + if (opt.reg) { + hts_idx_t *idx = has_index_file + ? sam_index_load2(fp[i], argv[optind], argv[optind+nfiles]) + : sam_index_load(fp[i], argv[optind]); + if (!idx) { + print_error("depth", "cannot load index for \"%s\"", + argv[optind]); + return 1; + } + if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) { + print_error("depth", "cannot parse region \"%s\"", opt.reg); + return 1; + } + hts_idx_destroy(idx); + } + } + + int ret = fastdepth_core(&opt, nfiles, &argv[argc-nfiles], fp, itr, header) + ? 
1 : 0; + + for (i = 0; i < nfiles; i++) { + sam_hdr_destroy(header[i]); + sam_close(fp[i]); + if (itr && itr[i]) + hts_itr_destroy(itr[i]); + } + free(header); + free(fp); + free(itr); + if (file_list) { + for (i=0; i @@ -51,6 +51,8 @@ struct parsed_opts { rg_mode mode; sam_global_args ga; htsThreadPool p; + int uncompressed; + int overwrite_hdr_rg; }; struct state; @@ -164,13 +166,15 @@ static char* get_rg_id(const char *line) static void usage(FILE *fp) { fprintf(fp, - "Usage: samtools addreplacerg [options] [-r <@RG line> | -R ] [-o ] \n" + "Usage: samtools addreplacerg [options] [-r <@RG line> | -R ] [-m orphan_only|overwrite_all] [-o ] \n" "\n" "Options:\n" " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n" " -o FILE Where to write output to [stdout]\n" " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" + " -u Output uncompressed data\n" + " -w Overwrite an existing @RG line\n" " --no-PG Do not add a PG line\n" ); sam_global_opt_help(fp, "..O..@.."); @@ -198,7 +202,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) }; kstring_t rg_line = {0,0,NULL}; - while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) { + while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) { switch (n) { case 'r': // Are we adding to existing rg line? 
@@ -235,6 +239,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) case 1: retval->no_pg = 1; break; + case 'u': + retval->uncompressed = 1; + break; + case 'w': + retval->overwrite_hdr_rg = 1; + break; case '?': usage(stderr); free(retval); @@ -314,7 +324,7 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read) } static bool init(const parsed_opts_t* opts, state_t** state_out) { - char output_mode[8] = "w"; + char output_mode[9] = "w"; state_t* retval = (state_t*) calloc(1, sizeof(state_t)); if (retval == NULL) { @@ -332,8 +342,12 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { retval->input_header = sam_hdr_read(retval->input_file); retval->output_header = sam_hdr_dup(retval->input_header); + + if (opts->uncompressed) + strcat(output_mode, "0"); if (opts->output_name) // File format auto-detection - sam_open_mode(output_mode + 1, opts->output_name, NULL); + sam_open_mode(output_mode + strlen(output_mode), + opts->output_name, NULL); retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out); if (retval->output_file == NULL) { @@ -351,10 +365,20 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Check does not already exist kstring_t hdr_line = { 0, 0, NULL }; if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { - fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n"); - free(hdr_line.s); - return false; + if (opts->overwrite_hdr_rg) { + if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) { + fprintf(stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id); + ks_free(&hdr_line); + return false; + } + } else { + fprintf(stderr, "[init] RG line with ID:%s already present in the header. 
Use -w to overwrite.\n", opts->rg_id); + ks_free(&hdr_line); + return false; + } } + ks_free(&hdr_line); + if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); return false; @@ -374,7 +398,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { return false; } retval->rg_id = strdup(opts->rg_id); - free(hdr_line.s); + ks_free(&hdr_line); } else { kstring_t rg_id = { 0, 0, NULL }; if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c index ba1cb08..88ce7e3 100644 --- a/samtools/bam_addrprg.c.pysam.c +++ b/samtools/bam_addrprg.c.pysam.c @@ -2,7 +2,7 @@ /* bam_addrprg.c -- samtools command to add or replace readgroups. - Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited. + Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited. Author: Martin O. 
Pollard @@ -53,6 +53,8 @@ struct parsed_opts { rg_mode mode; sam_global_args ga; htsThreadPool p; + int uncompressed; + int overwrite_hdr_rg; }; struct state; @@ -166,13 +168,15 @@ static char* get_rg_id(const char *line) static void usage(FILE *fp) { fprintf(fp, - "Usage: samtools addreplacerg [options] [-r <@RG line> | -R ] [-o ] \n" + "Usage: samtools addreplacerg [options] [-r <@RG line> | -R ] [-m orphan_only|overwrite_all] [-o ] \n" "\n" "Options:\n" " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n" " -o FILE Where to write output to [samtools_stdout]\n" " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" + " -u Output uncompressed data\n" + " -w Overwrite an existing @RG line\n" " --no-PG Do not add a PG line\n" ); sam_global_opt_help(fp, "..O..@.."); @@ -200,7 +204,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) }; kstring_t rg_line = {0,0,NULL}; - while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) { + while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) { switch (n) { case 'r': // Are we adding to existing rg line? 
@@ -237,6 +241,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) case 1: retval->no_pg = 1; break; + case 'u': + retval->uncompressed = 1; + break; + case 'w': + retval->overwrite_hdr_rg = 1; + break; case '?': usage(samtools_stderr); free(retval); @@ -316,7 +326,7 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read) } static bool init(const parsed_opts_t* opts, state_t** state_out) { - char output_mode[8] = "w"; + char output_mode[9] = "w"; state_t* retval = (state_t*) calloc(1, sizeof(state_t)); if (retval == NULL) { @@ -334,8 +344,12 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { retval->input_header = sam_hdr_read(retval->input_file); retval->output_header = sam_hdr_dup(retval->input_header); + + if (opts->uncompressed) + strcat(output_mode, "0"); if (opts->output_name) // File format auto-detection - sam_open_mode(output_mode + 1, opts->output_name, NULL); + sam_open_mode(output_mode + strlen(output_mode), + opts->output_name, NULL); retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out); if (retval->output_file == NULL) { @@ -353,10 +367,20 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Check does not already exist kstring_t hdr_line = { 0, 0, NULL }; if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { - fprintf(samtools_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n"); - free(hdr_line.s); - return false; + if (opts->overwrite_hdr_rg) { + if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) { + fprintf(samtools_stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id); + ks_free(&hdr_line); + return false; + } + } else { + fprintf(samtools_stderr, "[init] RG line with ID:%s already present in the header. 
Use -w to overwrite.\n", opts->rg_id); + ks_free(&hdr_line); + return false; + } } + ks_free(&hdr_line); + if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); return false; @@ -376,7 +400,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { return false; } retval->rg_id = strdup(opts->rg_id); - free(hdr_line.s); + ks_free(&hdr_line); } else { kstring_t rg_id = { 0, 0, NULL }; if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c new file mode 100644 index 0000000..f3fe2bc --- /dev/null +++ b/samtools/bam_ampliconclip.c @@ -0,0 +1,1079 @@ +/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads + from the 5' end. + + Copyright (C) 2020-2021 Genome Research Ltd. + + Authors: Andrew Whitwham + Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include "htslib/thread_pool.h" +#include "sam_opts.h" +#include +#include "htslib/hfile.h" +#include "htslib/kstring.h" +#include "htslib/sam.h" +#include "samtools.h" +#include "bam_ampliconclip.h" + +typedef enum { + soft_clip, + hard_clip +} clipping_type; + +typedef struct { + int add_pg; + int use_strand; + int write_clipped; + int mark_fail; + int both; + int fail_len; + int filter_len; + int unmapped; + int oa_tag; + int del_tag; + int tol; + char *arg_list; + char *stats_file; + char *rejects_file; +} cl_param_t; + + +static int bed_entry_sort(const void *av, const void *bv) { + bed_entry_t *a = (bed_entry_t *) av; + bed_entry_t *b = (bed_entry_t *) bv; + return a->right < b->right ? -1 : (a->right == b->right ? 0 : 1); +} + + +int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) { + hFILE *fp; + int line_count = 0, ret; + int64_t left, right; + kstring_t line = KS_INITIALIZE; + bed_entry_list_t *list; + khiter_t bed_itr; + + if ((fp = hopen(infile, "r")) == NULL) { + print_error_errno("amplicon", "unable to open file %s.", infile); + return 1; + } + + char ref[1024]; + + while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) { + line_count++; + int hret; + char strand; + + if (line.l == 0 || *line.s == '#') continue; + if (strncmp(line.s, "track ", 6) == 0) continue; + if (strncmp(line.s, "browser ", 8) == 0) continue; + + if (get_strand) { + if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c", + ref, &left, &right, &strand) != 4) { + fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s.\n" + "(N.B. 
ref/chrom name limited to 1023 characters.)\n", + line_count, infile); + ret = 1; + goto error; + } + } else { + if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64, + ref, &left, &right) != 3) { + fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s\n" + "(N.B. ref/chrom name limited to 1023 characters.)\n", + line_count, infile); + ret = 1; + goto error; + } + } + + bed_itr = kh_get(bed_list_hash, bed_lists, ref); + + if (bed_itr == kh_end(bed_lists)) { // new ref entry + char *ref_name = strdup(ref); // need a copy for the hash key + + if (!ref_name) { + fprintf(stderr, "[amplicon] error: unable to allocate memory for ref name.\n"); + ret = 1; + goto error; + } + + bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret); + + if (hret > 0) { + list = &kh_val(bed_lists, bed_itr); + + // initialise the new hash entry + list->longest = 0; + list->size = 0; + list->length = 0; + list->bp = NULL; + } else { + fprintf(stderr, "[amplicon] error: ref hashing failure.\n"); + ret = 1; + goto error; + } + } else { // existing ref + list = &kh_val(bed_lists, bed_itr); + } + + if (list->length == list->size) { + bed_entry_t *tmp; + + list->size += list->size / 2 + 256; + + if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) { + fprintf(stderr, "[amplicon] error: unable to allocate more memory for bed data.\n"); + ret = 1; + goto error; + } + + list->bp = tmp; + } + + list->bp[list->length].left = left; + list->bp[list->length].right = right; + + if (get_strand) { + if (strand == '+') { + list->bp[list->length].rev = 0; + } else if (strand == '-') { + list->bp[list->length].rev = 1; + } else { + fprintf(stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n", + line_count, strand); + ret = 1; + goto error; + } + } + + if (right - left > list->longest) + list->longest = right - left; + + list->length++; + } + + if (sort_by_pos) { + for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); 
++bed_itr) { + if (kh_exist(bed_lists, bed_itr)) { + list = &kh_val(bed_lists, bed_itr); + qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort); + } + } + } + + if (kh_size(bed_lists) > 0) {// any entries + ret = 0; + } else { + ret = 1; + } + +error: + ks_free(&line); + + if (hclose(fp) != 0) { + fprintf(stderr, "[amplicon] warning: failed to close %s", infile); + } + + return ret; +} + + +void destroy_bed_hash(khash_t(bed_list_hash) *hash) { + khiter_t itr; + + for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) { + if (kh_exist(hash, itr)) { + free(kh_val(hash, itr).bp); + free((char *)kh_key(hash, itr)); + kh_key(hash, itr) = NULL; + } + } + + kh_destroy(bed_list_hash, hash); +} + + +static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos, + int is_rev, int use_strand, int64_t longest, + cl_param_t *param) { + int i, size; // may need this to be variable + int tol = param->tol; + int l = 0, mid = sites->length / 2, r = sites->length; + int pos_tol = is_rev ? (pos > tol ? 
pos - tol : 0) : pos; + + while (r - l > 1) { + if (sites->bp[mid].right <= pos_tol) { + l = mid; + } else { + r = mid; + } + mid = (l + r) / 2; + } + + size = 0; + + for (i = l; i < sites->length; i++) { + hts_pos_t mod_left, mod_right; + + if (use_strand && is_rev != sites->bp[i].rev) + continue; + + if (is_rev) { + mod_left = sites->bp[i].left; + mod_right = sites->bp[i].right + tol; + } else { + if (sites->bp[i].left > tol) { + mod_left = sites->bp[i].left - tol; + } else { + mod_left = 0; + } + mod_right = sites->bp[i].right; + } + + if (pos + longest + tol < mod_right) + break; + + if (pos >= mod_left && pos <= mod_right) { + if (is_rev) { + if (size < pos - sites->bp[i].left) { + size = pos - sites->bp[i].left; + } + } else { + if (size < sites->bp[i].right - pos) { + size = sites->bp[i].right - pos; + } + } + } + } + + return size; +} + + +static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, + clipping_type clipping) { + uint32_t *orig_cigar = bam_get_cigar(rec); + uint8_t *orig_seq = bam_get_seq(rec); + uint8_t *orig_qual = bam_get_qual(rec); + uint8_t *orig_aux = bam_get_aux(rec); + uint32_t *new_cigar; + uint8_t *new_qual; + size_t orig_l_aux = bam_get_l_aux(rec); + uint32_t i, j, odd_base = 0; + uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0; + hts_pos_t new_pos = rec->core.pos; + uint32_t cig_type, cig_op; + + if (rec->l_data + 8 > rec_out->m_data) { + uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8); + if (!new_data) { + fprintf(stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n"); + return 1; + } + rec_out->data = new_data; + rec_out->m_data = rec->l_data + 8; + } + + // Copy core data & name + memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); + memcpy(rec_out->data, rec->data, rec->core.l_qname); + + if (clipping == hard_clip && bases >= rec->core.l_qseq) { + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, 
orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; + } + + // Modify CIGAR + new_cigar = bam_get_cigar(rec_out); + + for (i = 0; i < rec->core.n_cigar; i++) { + cig_op = bam_cigar_op(orig_cigar[i]); + cig_type = bam_cigar_type(cig_op); + + if (cig_op == BAM_CHARD_CLIP) { + hardclip += bam_cigar_oplen(orig_cigar[i]); + } else { + if (cig_type & 2) { + if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) { + ref_remove -= bam_cigar_oplen(orig_cigar[i]); + } else { + break; + } + new_pos += bam_cigar_oplen(orig_cigar[i]); + } + if (cig_type & 1) { + qry_removed += bam_cigar_oplen(orig_cigar[i]); + } + } + } + + if (i < rec->core.n_cigar) { + cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i])); + + // account for the last operation + if (cig_type & 2) { + new_pos += ref_remove; + } + if (cig_type & 1) { + qry_removed += ref_remove; + } + } else { + qry_removed = rec->core.l_qseq; + } + + j = 0; + if (clipping == hard_clip && hardclip + qry_removed > 0) { + new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP); + } + if (clipping == soft_clip) { + if (hardclip > 0) { + new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP); + } + if (qry_removed > 0) { + new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP); + } + } + + if (i < rec->core.n_cigar + && bam_cigar_oplen(orig_cigar[i]) > ref_remove) { + new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i])); + + // fill in the rest of the cigar + i++; + + for (; i < rec->core.n_cigar; i++) { + new_cigar[j++] = orig_cigar[i]; + } + } + + rec_out->core.n_cigar = j; + + if (clipping == soft_clip) { + qry_removed = 0; // Copy all the sequence and confidence values + odd_base = 1; // account for an odd number of bases + } + + new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2; + // Copy remaining SEQ + if ((qry_removed & 1) == 0) { + memcpy(bam_get_seq(rec_out), orig_seq + 
(qry_removed / 2), + (rec->core.l_qseq - qry_removed + odd_base) / 2); + } else { + uint8_t *in = orig_seq + qry_removed / 2; + uint8_t *out = bam_get_seq(rec_out); + uint32_t i; + for (i = qry_removed; i < rec->core.l_qseq - 1; i += 2) { + *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4); + in++; + } + if (i < rec->core.l_qseq) { + *out++ = (in[0] & 0x0f) << 4; + } + assert(out == new_qual); + } + + // Copy remaining QUAL + memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed); + + // Set new l_qseq + rec_out->core.l_qseq -= qry_removed; + + // Move AUX + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + // Set new l_data + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + // put in new pos + rec_out->core.pos = new_pos; + + return 0; +} + + +static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases, + clipping_type clipping) { + uint32_t *orig_cigar = bam_get_cigar(rec); + uint8_t *orig_seq = bam_get_seq(rec); + uint8_t *orig_qual = bam_get_qual(rec); + uint8_t *orig_aux = bam_get_aux(rec); + uint32_t *new_cigar; + uint32_t new_n_cigar = 0; + uint8_t *new_qual; + size_t orig_l_aux = bam_get_l_aux(rec); + int32_t i; + int32_t j; + uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0; + uint32_t cig_type, cig_op; + + if (rec->l_data + 8 > rec_out->m_data) { + uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8); + if (!new_data) { + fprintf(stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n"); + return 1; + } + rec_out->data = new_data; + rec_out->m_data = rec->l_data + 8; + } + + // Copy core data & name + memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); + memcpy(rec_out->data, rec->data, rec->core.l_qname); + + if (clipping == hard_clip && bases >= rec->core.l_qseq) { + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - 
rec_out->data + orig_l_aux; + return 0; + } + + // Modify CIGAR here + new_cigar = bam_get_cigar(rec_out); + + for (i = rec->core.n_cigar - 1; i >= 0; --i) { + cig_op = bam_cigar_op(orig_cigar[i]); + cig_type = bam_cigar_type(cig_op); + + if (cig_op == BAM_CHARD_CLIP) { + hardclip += bam_cigar_oplen(orig_cigar[i]); + } else { + if (cig_type & 2) { + if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) { + ref_remove -= bam_cigar_oplen(orig_cigar[i]); + } else { + break; + } + } + if (cig_type & 1) { + qry_removed += bam_cigar_oplen(orig_cigar[i]); + } + } + } + + if (i >= 0) { + cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i])); + if (cig_type & 1) { + qry_removed += ref_remove; + } + j = i; + if (qry_removed > 0) j++; + if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++; + } else { + qry_removed = rec->core.l_qseq; + j = 0; + if (hardclip > 0 && clipping == soft_clip) j++; + } + + if (clipping == hard_clip && hardclip + qry_removed > 0) { + new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP); + new_n_cigar++; + } + if (clipping == soft_clip) { + if (hardclip > 0) { + new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP); + new_n_cigar++; + if (qry_removed > 0) --j; + } + if (qry_removed > 0) { + new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP); + new_n_cigar++; + } + } + + if (j > 0) { + new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i])); + new_n_cigar++; + } + + // fill in the rest of the cigar + while (j > 0) { + new_cigar[--j] = orig_cigar[--i]; + new_n_cigar++; + } + + rec_out->core.n_cigar = new_n_cigar; + + if (clipping == soft_clip) + qry_removed = 0; // Copy all the sequence and confidence values + + new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2; + // Copy remaining SEQ + memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2); + + // Copy remaining QUAL + memcpy(new_qual, orig_qual, rec->core.l_qseq - 
qry_removed); + + // Set new l_qseq + rec_out->core.l_qseq -= qry_removed; + + // Copy AUX + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + // Set new l_data + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; +} + + +static hts_pos_t active_query_len(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + uint32_t cig_type, cig_op; + hts_pos_t len = 0; + int i; + + for (i = 0; i < b->core.n_cigar; i++) { + cig_op = bam_cigar_op(cigar[i]); + cig_type = bam_cigar_type(cig_op); + + if ((cig_type & 1) && (cig_op != BAM_CSOFT_CLIP)) { + len += bam_cigar_oplen(cigar[i]); + } + } + + return len; +} + + +static inline void swap_bams(bam1_t **a, bam1_t **b) { + bam1_t *tmp = *a; + *a = *b; + *b = tmp; +} + + +// Format OA:Z:(RNAME,POS,strand,CIGAR,MAPQ,NM; +static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) { + char strand; + uint8_t *nm_tag, *old_oa_tag; + uint32_t *cigar; + int64_t nm = 0; + int i, res = 0; + + ks_clear(oa_tag); + + // if there is an existing OA tag the new one gets appended to it + if ((old_oa_tag = bam_aux_get(orig, "OA"))) { + res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0; + } + + if (orig->core.flag & BAM_FREVERSE) + strand = '-'; + else + strand = '+'; + + if ((nm_tag = bam_aux_get(orig, "NM"))) { + nm = bam_aux2i(nm_tag); + } + + res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0; + + for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) { + res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0; + res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0; + } + + if (nm_tag) { + res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0; + } else { + res |= ksprintf(oa_tag, "%d,;", orig->core.qual) < 0; + } + + return res; +} + + +static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, + clipping_type clipping, cl_param_t *param) { + int ret = 1, r, file_open = 0; 
+ + bam_hdr_t *header = NULL; + bam1_t *b = NULL, *b_tmp = NULL; + long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0; + long filtered = 0, written = 0, failed = 0; + kstring_t str = KS_INITIALIZE; + kstring_t oat = KS_INITIALIZE; + bed_entry_list_t *sites; + FILE *stats_fp = stderr; + khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); + + if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) { + fprintf(stderr, "[ampliconclip] error: unable to load bed file.\n"); + goto fail; + } + + if ((header = sam_hdr_read(in)) == NULL) { + fprintf(stderr, "[ampliconclip] error: could not read header\n"); + goto fail; + } + + // changing pos can ruin coordinate sort order + if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) { + const char *new_order = "unknown"; + + if (sam_hdr_update_hd(header, "SO", new_order) == -1) { + fprintf(stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order); + goto fail; + } + } + + ks_free(&str); + + if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(), + param->arg_list ? "CL" : NULL, + param->arg_list ? 
param->arg_list : NULL, + NULL) != 0) { + fprintf(stderr, "[ampliconclip] warning: unable to add @PG line to header.\n"); + } + if (sam_hdr_write(out, header) < 0) { + fprintf(stderr, "[ampliconclip] error: could not write header.\n"); + goto fail; + } + + if (reject) { + if (sam_hdr_write(reject, header) < 0) { + fprintf(stderr, "[ampliconclip] error: could not write header to rejects file.\n"); + goto fail; + } + } + + b = bam_init1(); + b_tmp = bam_init1(); + if (!b || !b_tmp) { + fprintf(stderr, "[ampliconclip] error: out of memory when trying to create record.\n"); + goto fail; + } + + int32_t last_tid = -1; + int ref_found = 0; + + while ((r = sam_read1(in, header, b)) >= 0) { + hts_pos_t pos; + int is_rev; + int p_size; + int been_clipped = 0, filter = 0; + int exclude = (BAM_FUNMAP | BAM_FQCFAIL); + khiter_t itr; + + l_count++; + + if (b->core.tid != last_tid) { + const char *ref_name; + + ref_found = 0; + last_tid = b->core.tid; + + if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) { + itr = kh_get(bed_list_hash, bed_hash, ref_name); + + if (itr != kh_end(bed_hash)) { + sites = &kh_val(bed_hash, itr); + ref_found = 1; + } + } + } + + if (!(b->core.flag & exclude) && ref_found) { + if (param->oa_tag) + if (tag_original_data(b, &oat)) + goto fail; + + if (!param->both) { + if (bam_is_rev(b)) { + pos = bam_endpos(b); + is_rev = 1; + } else { + pos = b->core.pos; + is_rev = 0; + } + + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { + if (is_rev) { + if (bam_trim_right(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + r_count++; + } else { + if (bam_trim_left(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + f_count++; + } + + if (param->oa_tag) { + if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s)) + goto fail; + } + + if (param->del_tag) { + uint8_t *tag; + + if ((tag = bam_aux_get(b, "NM"))) + bam_aux_del(b, tag); + + if ((tag = 
bam_aux_get(b, "MD"))) + bam_aux_del(b, tag); + } + + been_clipped = 1; + } else { + if (param->mark_fail) { + b->core.flag |= BAM_FQCFAIL; + } + + n_count++; + } + } else { + int left = 0, right = 0; + + // left first + pos = b->core.pos; + is_rev = 0; + + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { + if (bam_trim_left(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + f_count++; + left = 1; + been_clipped = 1; + } + + // the right + pos = bam_endpos(b); + is_rev = 1; + + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { + if (bam_trim_right(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + r_count++; + right = 1; + been_clipped = 1; + } + + if (left || right) { + uint8_t *tag; + + if (param->oa_tag) { + if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s)) + goto fail; + } + + if (param->del_tag) { + if ((tag = bam_aux_get(b, "NM"))) + bam_aux_del(b, tag); + + if ((tag = bam_aux_get(b, "MD"))) + bam_aux_del(b, tag); + } + } + + if (left && right) { + b_count++; + } else if (!left && !right) { + if (param->mark_fail) { + b->core.flag |= BAM_FQCFAIL; + } + + n_count++; + } + } + + if (param->fail_len >= 0 || param->filter_len >= 0) { + hts_pos_t aql = active_query_len(b); + + if (param->fail_len >= 0 && aql <= param->fail_len) { + b->core.flag |= BAM_FQCFAIL; + } + + if (param->filter_len >= 0 && aql <= param->filter_len) { + filter = 1; + } + } + + if (b->core.flag & BAM_FQCFAIL) { + failed++; + } + + if (param->write_clipped && !been_clipped) { + filter = 1; + } + + } else { + l_exclude++; + + if (param->unmapped) { + filter = 1; + } + } + + if (!filter) { + if (sam_write1(out, header, b) < 0) { + fprintf(stderr, "[ampliconclip] error: could not write line %ld.\n", l_count); + goto fail; + } + + written++; + } else { + if (reject) { + if (sam_write1(reject, header, b) < 0) { + fprintf(stderr, 
"[ampliconclip] error: could not write to reject file %s\n", + param->rejects_file); + goto fail; + } + } + + filtered++; + } + } + + if (r < -1) { + fprintf(stderr, "[ampliconclip] error: failed to read input.\n"); + goto fail; + } + + if (param->stats_file) { + if ((stats_fp = fopen(param->stats_file, "w")) == NULL) { + fprintf(stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file); + } else { + file_open = 1; + } + } + + fprintf(stats_fp, "COMMAND: %s\n" + "TOTAL READS: %ld\n" + "TOTAL CLIPPED: %ld\n" + "FORWARD CLIPPED: %ld\n" + "REVERSE CLIPPED: %ld\n" + "BOTH CLIPPED: %ld\n" + "NOT CLIPPED: %ld\n" + "EXCLUDED: %ld\n" + "FILTERED: %ld\n" + "FAILED: %ld\n" + "WRITTEN: %ld\n", param->arg_list, l_count, f_count + r_count, + f_count, r_count, b_count, n_count, l_exclude, + filtered, failed, written); + + if (file_open) { + fclose(stats_fp); + } + + ret = 0; + +fail: + destroy_bed_hash(bed_hash); + ks_free(&oat); + sam_hdr_destroy(header); + bam_destroy1(b); + bam_destroy1(b_tmp); + return ret; +} + + +static void usage(void) { + fprintf(stderr, "Usage: samtools ampliconclip -b BED file -o \n\n"); + fprintf(stderr, "Option: \n"); + fprintf(stderr, " -b FILE BED file of regions (eg amplicon primers) to be removed.\n"); + fprintf(stderr, " -o FILE output file name (default stdout).\n"); + fprintf(stderr, " -f FILE write stats to file name (default stderr)\n"); + fprintf(stderr, " -u Output uncompressed data\n"); + fprintf(stderr, " --soft-clip soft clip amplicon primers from reads (default)\n"); + fprintf(stderr, " --hard-clip hard clip amplicon primers from reads.\n"); + fprintf(stderr, " --both-ends clip on both 5' and 3' ends.\n"); + fprintf(stderr, " --strand use strand data from BED file to match read direction.\n"); + fprintf(stderr, " --clipped only output clipped reads.\n"); + fprintf(stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n"); + fprintf(stderr, " --filter-len INT do not output reads INT size or shorter.\n"); + 
fprintf(stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n"); + fprintf(stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n"); + fprintf(stderr, " --rejects-file FILE file to write filtered reads.\n"); + fprintf(stderr, " --original for clipped entries add an OA tag with original data.\n"); + fprintf(stderr, " --keep-tag for clipped entries keep the old NM and MD tags.\n"); + fprintf(stderr, " --tolerance match region within this number of bases, default 5.\n"); + fprintf(stderr, " --no-PG do not add an @PG line.\n"); + sam_global_opt_help(stderr, "-.O..@-."); + fprintf(stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n" + "Default clipping is only on the 5' end.\n\n"); +} + + +int amplicon_clip_main(int argc, char **argv) { + int c, ret; + char wmode[4] = {'w', 'b', 0, 0}; + char *bedfile = NULL, *fnout = "-"; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool p = {NULL, 0}; + samFile *in = NULL, *out = NULL, *reject = NULL; + clipping_type clipping = soft_clip; + cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), + {"no-PG", no_argument, NULL, 1002}, + {"soft-clip", no_argument, NULL, 1003}, + {"hard-clip", no_argument, NULL, 1004}, + {"strand", no_argument, NULL, 1005}, + {"clipped", no_argument, NULL, 1006}, + {"fail", no_argument, NULL, 1007}, + {"both-ends", no_argument, NULL, 1008}, + {"filter-len", required_argument, NULL, 1009}, + {"fail-len", required_argument, NULL, 1010}, + {"no-excluded", no_argument, NULL, 1011}, + {"rejects-file", required_argument, NULL, 1012}, + {"original", no_argument, NULL, 1013}, + {"keep-tag", no_argument, NULL, 1014}, + {"tolerance", required_argument, NULL, 1015}, + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) { + switch (c) { + case 'b': bedfile = optarg; break; + 
case 'o': fnout = optarg; break; + case 'f': param.stats_file = optarg; break; + case 'u': wmode[2] = '0'; break; + case 1002: param.add_pg = 0; break; + case 1003: clipping = soft_clip; break; + case 1004: clipping = hard_clip; break; + case 1005: param.use_strand = 1; break; + case 1006: param.write_clipped = 1; break; + case 1007: param.mark_fail = 1; break; + case 1008: param.both = 1; break; + case 1009: param.filter_len = atoi(optarg); break; + case 1010: param.fail_len = atoi(optarg); break; + case 1011: param.unmapped = 1; break; + case 1012: param.rejects_file = optarg; break; + case 1013: param.oa_tag = 1; break; + case 1014: param.del_tag = 0; break; + case 1015: param.tol = atoi(optarg); break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage(); exit(1); + } + } + + if (!bedfile) { + usage(); + return 1; + } + + if (optind + 1 > argc) { + usage(); + return 1; + } + + if (param.tol < 0) { + fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d," + " reseting tolerance to default of 5.\n", param.tol); + param.tol = 5; + } + + if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { + print_error_errno("ampliconclip", "cannot open input file"); + return 1; + } + + sam_open_mode(wmode+1, fnout, NULL); + + if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) { + print_error_errno("ampliconclip", "cannot open output file"); + return 1; + } + + if (param.rejects_file) { + sam_open_mode(wmode+1, param.rejects_file, NULL); + + if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) { + print_error_errno("ampliconclip", "cannot open rejects file"); + return 1; + } + } + + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(stderr, "[ampliconclip] error: cannot create thread pool.\n"); + return 1; + } + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + + if (reject) { + 
hts_set_opt(reject, HTS_OPT_THREAD_POOL, &p); + } + } + + param.arg_list = stringify_argv(argc + 1, argv - 1); + + ret = bam_clip(in, out, reject, bedfile, clipping, ¶m); + + // cleanup + sam_close(in); + + if (sam_close(out) < 0) { + fprintf(stderr, "[ampliconclip] error: error while closing output file %s.\n", argv[optind+1]); + ret = 1; + } + + if (reject) { + if (sam_close(reject) < 0) { + fprintf(stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file); + ret = 1; + } + } + + if (p.pool) hts_tpool_destroy(p.pool); + + sam_global_args_free(&ga); + free(param.arg_list); + + return ret; +} + diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c new file mode 100644 index 0000000..3b2ed29 --- /dev/null +++ b/samtools/bam_ampliconclip.c.pysam.c @@ -0,0 +1,1081 @@ +#include "samtools.pysam.h" + +/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads + from the 5' end. + + Copyright (C) 2020-2021 Genome Research Ltd. + + Authors: Andrew Whitwham + Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include "htslib/thread_pool.h" +#include "sam_opts.h" +#include +#include "htslib/hfile.h" +#include "htslib/kstring.h" +#include "htslib/sam.h" +#include "samtools.h" +#include "bam_ampliconclip.h" + +typedef enum { + soft_clip, + hard_clip +} clipping_type; + +typedef struct { + int add_pg; + int use_strand; + int write_clipped; + int mark_fail; + int both; + int fail_len; + int filter_len; + int unmapped; + int oa_tag; + int del_tag; + int tol; + char *arg_list; + char *stats_file; + char *rejects_file; +} cl_param_t; + + +static int bed_entry_sort(const void *av, const void *bv) { + bed_entry_t *a = (bed_entry_t *) av; + bed_entry_t *b = (bed_entry_t *) bv; + return a->right < b->right ? -1 : (a->right == b->right ? 0 : 1); +} + + +int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) { + hFILE *fp; + int line_count = 0, ret; + int64_t left, right; + kstring_t line = KS_INITIALIZE; + bed_entry_list_t *list; + khiter_t bed_itr; + + if ((fp = hopen(infile, "r")) == NULL) { + print_error_errno("amplicon", "unable to open file %s.", infile); + return 1; + } + + char ref[1024]; + + while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) { + line_count++; + int hret; + char strand; + + if (line.l == 0 || *line.s == '#') continue; + if (strncmp(line.s, "track ", 6) == 0) continue; + if (strncmp(line.s, "browser ", 8) == 0) continue; + + if (get_strand) { + if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c", + ref, &left, &right, &strand) != 4) { + fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s.\n" + "(N.B. 
ref/chrom name limited to 1023 characters.)\n", + line_count, infile); + ret = 1; + goto error; + } + } else { + if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64, + ref, &left, &right) != 3) { + fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s\n" + "(N.B. ref/chrom name limited to 1023 characters.)\n", + line_count, infile); + ret = 1; + goto error; + } + } + + bed_itr = kh_get(bed_list_hash, bed_lists, ref); + + if (bed_itr == kh_end(bed_lists)) { // new ref entry + char *ref_name = strdup(ref); // need a copy for the hash key + + if (!ref_name) { + fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for ref name.\n"); + ret = 1; + goto error; + } + + bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret); + + if (hret > 0) { + list = &kh_val(bed_lists, bed_itr); + + // initialise the new hash entry + list->longest = 0; + list->size = 0; + list->length = 0; + list->bp = NULL; + } else { + fprintf(samtools_stderr, "[amplicon] error: ref hashing failure.\n"); + ret = 1; + goto error; + } + } else { // existing ref + list = &kh_val(bed_lists, bed_itr); + } + + if (list->length == list->size) { + bed_entry_t *tmp; + + list->size += list->size / 2 + 256; + + if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) { + fprintf(samtools_stderr, "[amplicon] error: unable to allocate more memory for bed data.\n"); + ret = 1; + goto error; + } + + list->bp = tmp; + } + + list->bp[list->length].left = left; + list->bp[list->length].right = right; + + if (get_strand) { + if (strand == '+') { + list->bp[list->length].rev = 0; + } else if (strand == '-') { + list->bp[list->length].rev = 1; + } else { + fprintf(samtools_stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n", + line_count, strand); + ret = 1; + goto error; + } + } + + if (right - left > list->longest) + list->longest = right - left; + + list->length++; + } + + if (sort_by_pos) { + for (bed_itr = 
kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) { + if (kh_exist(bed_lists, bed_itr)) { + list = &kh_val(bed_lists, bed_itr); + qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort); + } + } + } + + if (kh_size(bed_lists) > 0) {// any entries + ret = 0; + } else { + ret = 1; + } + +error: + ks_free(&line); + + if (hclose(fp) != 0) { + fprintf(samtools_stderr, "[amplicon] warning: failed to close %s", infile); + } + + return ret; +} + + +void destroy_bed_hash(khash_t(bed_list_hash) *hash) { + khiter_t itr; + + for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) { + if (kh_exist(hash, itr)) { + free(kh_val(hash, itr).bp); + free((char *)kh_key(hash, itr)); + kh_key(hash, itr) = NULL; + } + } + + kh_destroy(bed_list_hash, hash); +} + + +static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos, + int is_rev, int use_strand, int64_t longest, + cl_param_t *param) { + int i, size; // may need this to be variable + int tol = param->tol; + int l = 0, mid = sites->length / 2, r = sites->length; + int pos_tol = is_rev ? (pos > tol ? 
pos - tol : 0) : pos; + + while (r - l > 1) { + if (sites->bp[mid].right <= pos_tol) { + l = mid; + } else { + r = mid; + } + mid = (l + r) / 2; + } + + size = 0; + + for (i = l; i < sites->length; i++) { + hts_pos_t mod_left, mod_right; + + if (use_strand && is_rev != sites->bp[i].rev) + continue; + + if (is_rev) { + mod_left = sites->bp[i].left; + mod_right = sites->bp[i].right + tol; + } else { + if (sites->bp[i].left > tol) { + mod_left = sites->bp[i].left - tol; + } else { + mod_left = 0; + } + mod_right = sites->bp[i].right; + } + + if (pos + longest + tol < mod_right) + break; + + if (pos >= mod_left && pos <= mod_right) { + if (is_rev) { + if (size < pos - sites->bp[i].left) { + size = pos - sites->bp[i].left; + } + } else { + if (size < sites->bp[i].right - pos) { + size = sites->bp[i].right - pos; + } + } + } + } + + return size; +} + + +static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, + clipping_type clipping) { + uint32_t *orig_cigar = bam_get_cigar(rec); + uint8_t *orig_seq = bam_get_seq(rec); + uint8_t *orig_qual = bam_get_qual(rec); + uint8_t *orig_aux = bam_get_aux(rec); + uint32_t *new_cigar; + uint8_t *new_qual; + size_t orig_l_aux = bam_get_l_aux(rec); + uint32_t i, j, odd_base = 0; + uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0; + hts_pos_t new_pos = rec->core.pos; + uint32_t cig_type, cig_op; + + if (rec->l_data + 8 > rec_out->m_data) { + uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8); + if (!new_data) { + fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n"); + return 1; + } + rec_out->data = new_data; + rec_out->m_data = rec->l_data + 8; + } + + // Copy core data & name + memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); + memcpy(rec_out->data, rec->data, rec->core.l_qname); + + if (clipping == hard_clip && bases >= rec->core.l_qseq) { + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), 
orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; + } + + // Modify CIGAR + new_cigar = bam_get_cigar(rec_out); + + for (i = 0; i < rec->core.n_cigar; i++) { + cig_op = bam_cigar_op(orig_cigar[i]); + cig_type = bam_cigar_type(cig_op); + + if (cig_op == BAM_CHARD_CLIP) { + hardclip += bam_cigar_oplen(orig_cigar[i]); + } else { + if (cig_type & 2) { + if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) { + ref_remove -= bam_cigar_oplen(orig_cigar[i]); + } else { + break; + } + new_pos += bam_cigar_oplen(orig_cigar[i]); + } + if (cig_type & 1) { + qry_removed += bam_cigar_oplen(orig_cigar[i]); + } + } + } + + if (i < rec->core.n_cigar) { + cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i])); + + // account for the last operation + if (cig_type & 2) { + new_pos += ref_remove; + } + if (cig_type & 1) { + qry_removed += ref_remove; + } + } else { + qry_removed = rec->core.l_qseq; + } + + j = 0; + if (clipping == hard_clip && hardclip + qry_removed > 0) { + new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP); + } + if (clipping == soft_clip) { + if (hardclip > 0) { + new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP); + } + if (qry_removed > 0) { + new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP); + } + } + + if (i < rec->core.n_cigar + && bam_cigar_oplen(orig_cigar[i]) > ref_remove) { + new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i])); + + // fill in the rest of the cigar + i++; + + for (; i < rec->core.n_cigar; i++) { + new_cigar[j++] = orig_cigar[i]; + } + } + + rec_out->core.n_cigar = j; + + if (clipping == soft_clip) { + qry_removed = 0; // Copy all the sequence and confidence values + odd_base = 1; // account for an odd number of bases + } + + new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2; + // Copy remaining SEQ + if ((qry_removed & 1) == 0) { + memcpy(bam_get_seq(rec_out), orig_seq + 
(qry_removed / 2), + (rec->core.l_qseq - qry_removed + odd_base) / 2); + } else { + uint8_t *in = orig_seq + qry_removed / 2; + uint8_t *out = bam_get_seq(rec_out); + uint32_t i; + for (i = qry_removed; i < rec->core.l_qseq - 1; i += 2) { + *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4); + in++; + } + if (i < rec->core.l_qseq) { + *out++ = (in[0] & 0x0f) << 4; + } + assert(out == new_qual); + } + + // Copy remaining QUAL + memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed); + + // Set new l_qseq + rec_out->core.l_qseq -= qry_removed; + + // Move AUX + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + // Set new l_data + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + // put in new pos + rec_out->core.pos = new_pos; + + return 0; +} + + +static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases, + clipping_type clipping) { + uint32_t *orig_cigar = bam_get_cigar(rec); + uint8_t *orig_seq = bam_get_seq(rec); + uint8_t *orig_qual = bam_get_qual(rec); + uint8_t *orig_aux = bam_get_aux(rec); + uint32_t *new_cigar; + uint32_t new_n_cigar = 0; + uint8_t *new_qual; + size_t orig_l_aux = bam_get_l_aux(rec); + int32_t i; + int32_t j; + uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0; + uint32_t cig_type, cig_op; + + if (rec->l_data + 8 > rec_out->m_data) { + uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8); + if (!new_data) { + fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n"); + return 1; + } + rec_out->data = new_data; + rec_out->m_data = rec->l_data + 8; + } + + // Copy core data & name + memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); + memcpy(rec_out->data, rec->data, rec->core.l_qname); + + if (clipping == hard_clip && bases >= rec->core.l_qseq) { + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) 
- rec_out->data + orig_l_aux; + return 0; + } + + // Modify CIGAR here + new_cigar = bam_get_cigar(rec_out); + + for (i = rec->core.n_cigar - 1; i >= 0; --i) { + cig_op = bam_cigar_op(orig_cigar[i]); + cig_type = bam_cigar_type(cig_op); + + if (cig_op == BAM_CHARD_CLIP) { + hardclip += bam_cigar_oplen(orig_cigar[i]); + } else { + if (cig_type & 2) { + if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) { + ref_remove -= bam_cigar_oplen(orig_cigar[i]); + } else { + break; + } + } + if (cig_type & 1) { + qry_removed += bam_cigar_oplen(orig_cigar[i]); + } + } + } + + if (i >= 0) { + cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i])); + if (cig_type & 1) { + qry_removed += ref_remove; + } + j = i; + if (qry_removed > 0) j++; + if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++; + } else { + qry_removed = rec->core.l_qseq; + j = 0; + if (hardclip > 0 && clipping == soft_clip) j++; + } + + if (clipping == hard_clip && hardclip + qry_removed > 0) { + new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP); + new_n_cigar++; + } + if (clipping == soft_clip) { + if (hardclip > 0) { + new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP); + new_n_cigar++; + if (qry_removed > 0) --j; + } + if (qry_removed > 0) { + new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP); + new_n_cigar++; + } + } + + if (j > 0) { + new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i])); + new_n_cigar++; + } + + // fill in the rest of the cigar + while (j > 0) { + new_cigar[--j] = orig_cigar[--i]; + new_n_cigar++; + } + + rec_out->core.n_cigar = new_n_cigar; + + if (clipping == soft_clip) + qry_removed = 0; // Copy all the sequence and confidence values + + new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2; + // Copy remaining SEQ + memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2); + + // Copy remaining QUAL + memcpy(new_qual, orig_qual, rec->core.l_qseq - 
qry_removed); + + // Set new l_qseq + rec_out->core.l_qseq -= qry_removed; + + // Copy AUX + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + // Set new l_data + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; +} + + +static hts_pos_t active_query_len(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + uint32_t cig_type, cig_op; + hts_pos_t len = 0; + int i; + + for (i = 0; i < b->core.n_cigar; i++) { + cig_op = bam_cigar_op(cigar[i]); + cig_type = bam_cigar_type(cig_op); + + if ((cig_type & 1) && (cig_op != BAM_CSOFT_CLIP)) { + len += bam_cigar_oplen(cigar[i]); + } + } + + return len; +} + + +static inline void swap_bams(bam1_t **a, bam1_t **b) { + bam1_t *tmp = *a; + *a = *b; + *b = tmp; +} + + +// Format OA:Z:(RNAME,POS,strand,CIGAR,MAPQ,NM; +static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) { + char strand; + uint8_t *nm_tag, *old_oa_tag; + uint32_t *cigar; + int64_t nm = 0; + int i, res = 0; + + ks_clear(oa_tag); + + // if there is an existing OA tag the new one gets appended to it + if ((old_oa_tag = bam_aux_get(orig, "OA"))) { + res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0; + } + + if (orig->core.flag & BAM_FREVERSE) + strand = '-'; + else + strand = '+'; + + if ((nm_tag = bam_aux_get(orig, "NM"))) { + nm = bam_aux2i(nm_tag); + } + + res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0; + + for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) { + res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0; + res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0; + } + + if (nm_tag) { + res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0; + } else { + res |= ksprintf(oa_tag, "%d,;", orig->core.qual) < 0; + } + + return res; +} + + +static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, + clipping_type clipping, cl_param_t *param) { + int ret = 1, r, file_open = 0; 
+ + bam_hdr_t *header = NULL; + bam1_t *b = NULL, *b_tmp = NULL; + long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0; + long filtered = 0, written = 0, failed = 0; + kstring_t str = KS_INITIALIZE; + kstring_t oat = KS_INITIALIZE; + bed_entry_list_t *sites; + FILE *stats_fp = samtools_stderr; + khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); + + if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) { + fprintf(samtools_stderr, "[ampliconclip] error: unable to load bed file.\n"); + goto fail; + } + + if ((header = sam_hdr_read(in)) == NULL) { + fprintf(samtools_stderr, "[ampliconclip] error: could not read header\n"); + goto fail; + } + + // changing pos can ruin coordinate sort order + if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) { + const char *new_order = "unknown"; + + if (sam_hdr_update_hd(header, "SO", new_order) == -1) { + fprintf(samtools_stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order); + goto fail; + } + } + + ks_free(&str); + + if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(), + param->arg_list ? "CL" : NULL, + param->arg_list ? 
param->arg_list : NULL, + NULL) != 0) { + fprintf(samtools_stderr, "[ampliconclip] warning: unable to add @PG line to header.\n"); + } + if (sam_hdr_write(out, header) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: could not write header.\n"); + goto fail; + } + + if (reject) { + if (sam_hdr_write(reject, header) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: could not write header to rejects file.\n"); + goto fail; + } + } + + b = bam_init1(); + b_tmp = bam_init1(); + if (!b || !b_tmp) { + fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create record.\n"); + goto fail; + } + + int32_t last_tid = -1; + int ref_found = 0; + + while ((r = sam_read1(in, header, b)) >= 0) { + hts_pos_t pos; + int is_rev; + int p_size; + int been_clipped = 0, filter = 0; + int exclude = (BAM_FUNMAP | BAM_FQCFAIL); + khiter_t itr; + + l_count++; + + if (b->core.tid != last_tid) { + const char *ref_name; + + ref_found = 0; + last_tid = b->core.tid; + + if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) { + itr = kh_get(bed_list_hash, bed_hash, ref_name); + + if (itr != kh_end(bed_hash)) { + sites = &kh_val(bed_hash, itr); + ref_found = 1; + } + } + } + + if (!(b->core.flag & exclude) && ref_found) { + if (param->oa_tag) + if (tag_original_data(b, &oat)) + goto fail; + + if (!param->both) { + if (bam_is_rev(b)) { + pos = bam_endpos(b); + is_rev = 1; + } else { + pos = b->core.pos; + is_rev = 0; + } + + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { + if (is_rev) { + if (bam_trim_right(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + r_count++; + } else { + if (bam_trim_left(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + f_count++; + } + + if (param->oa_tag) { + if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s)) + goto fail; + } + + if (param->del_tag) { + uint8_t *tag; + + if ((tag = bam_aux_get(b, "NM"))) + 
bam_aux_del(b, tag); + + if ((tag = bam_aux_get(b, "MD"))) + bam_aux_del(b, tag); + } + + been_clipped = 1; + } else { + if (param->mark_fail) { + b->core.flag |= BAM_FQCFAIL; + } + + n_count++; + } + } else { + int left = 0, right = 0; + + // left first + pos = b->core.pos; + is_rev = 0; + + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { + if (bam_trim_left(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + f_count++; + left = 1; + been_clipped = 1; + } + + // the right + pos = bam_endpos(b); + is_rev = 1; + + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { + if (bam_trim_right(b, b_tmp, p_size, clipping) != 0) + goto fail; + + swap_bams(&b, &b_tmp); + r_count++; + right = 1; + been_clipped = 1; + } + + if (left || right) { + uint8_t *tag; + + if (param->oa_tag) { + if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s)) + goto fail; + } + + if (param->del_tag) { + if ((tag = bam_aux_get(b, "NM"))) + bam_aux_del(b, tag); + + if ((tag = bam_aux_get(b, "MD"))) + bam_aux_del(b, tag); + } + } + + if (left && right) { + b_count++; + } else if (!left && !right) { + if (param->mark_fail) { + b->core.flag |= BAM_FQCFAIL; + } + + n_count++; + } + } + + if (param->fail_len >= 0 || param->filter_len >= 0) { + hts_pos_t aql = active_query_len(b); + + if (param->fail_len >= 0 && aql <= param->fail_len) { + b->core.flag |= BAM_FQCFAIL; + } + + if (param->filter_len >= 0 && aql <= param->filter_len) { + filter = 1; + } + } + + if (b->core.flag & BAM_FQCFAIL) { + failed++; + } + + if (param->write_clipped && !been_clipped) { + filter = 1; + } + + } else { + l_exclude++; + + if (param->unmapped) { + filter = 1; + } + } + + if (!filter) { + if (sam_write1(out, header, b) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: could not write line %ld.\n", l_count); + goto fail; + } + + written++; + } else { + if (reject) { + if 
(sam_write1(reject, header, b) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: could not write to reject file %s\n", + param->rejects_file); + goto fail; + } + } + + filtered++; + } + } + + if (r < -1) { + fprintf(samtools_stderr, "[ampliconclip] error: failed to read input.\n"); + goto fail; + } + + if (param->stats_file) { + if ((stats_fp = fopen(param->stats_file, "w")) == NULL) { + fprintf(samtools_stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file); + } else { + file_open = 1; + } + } + + fprintf(stats_fp, "COMMAND: %s\n" + "TOTAL READS: %ld\n" + "TOTAL CLIPPED: %ld\n" + "FORWARD CLIPPED: %ld\n" + "REVERSE CLIPPED: %ld\n" + "BOTH CLIPPED: %ld\n" + "NOT CLIPPED: %ld\n" + "EXCLUDED: %ld\n" + "FILTERED: %ld\n" + "FAILED: %ld\n" + "WRITTEN: %ld\n", param->arg_list, l_count, f_count + r_count, + f_count, r_count, b_count, n_count, l_exclude, + filtered, failed, written); + + if (file_open) { + fclose(stats_fp); + } + + ret = 0; + +fail: + destroy_bed_hash(bed_hash); + ks_free(&oat); + sam_hdr_destroy(header); + bam_destroy1(b); + bam_destroy1(b_tmp); + return ret; +} + + +static void usage(void) { + fprintf(samtools_stderr, "Usage: samtools ampliconclip -b BED file -o \n\n"); + fprintf(samtools_stderr, "Option: \n"); + fprintf(samtools_stderr, " -b FILE BED file of regions (eg amplicon primers) to be removed.\n"); + fprintf(samtools_stderr, " -o FILE output file name (default samtools_stdout).\n"); + fprintf(samtools_stderr, " -f FILE write stats to file name (default samtools_stderr)\n"); + fprintf(samtools_stderr, " -u Output uncompressed data\n"); + fprintf(samtools_stderr, " --soft-clip soft clip amplicon primers from reads (default)\n"); + fprintf(samtools_stderr, " --hard-clip hard clip amplicon primers from reads.\n"); + fprintf(samtools_stderr, " --both-ends clip on both 5' and 3' ends.\n"); + fprintf(samtools_stderr, " --strand use strand data from BED file to match read direction.\n"); + fprintf(samtools_stderr, " 
--clipped only output clipped reads.\n"); + fprintf(samtools_stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n"); + fprintf(samtools_stderr, " --filter-len INT do not output reads INT size or shorter.\n"); + fprintf(samtools_stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n"); + fprintf(samtools_stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n"); + fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n"); + fprintf(samtools_stderr, " --original for clipped entries add an OA tag with original data.\n"); + fprintf(samtools_stderr, " --keep-tag for clipped entries keep the old NM and MD tags.\n"); + fprintf(samtools_stderr, " --tolerance match region within this number of bases, default 5.\n"); + fprintf(samtools_stderr, " --no-PG do not add an @PG line.\n"); + sam_global_opt_help(samtools_stderr, "-.O..@-."); + fprintf(samtools_stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n" + "Default clipping is only on the 5' end.\n\n"); +} + + +int amplicon_clip_main(int argc, char **argv) { + int c, ret; + char wmode[4] = {'w', 'b', 0, 0}; + char *bedfile = NULL, *fnout = "-"; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool p = {NULL, 0}; + samFile *in = NULL, *out = NULL, *reject = NULL; + clipping_type clipping = soft_clip; + cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), + {"no-PG", no_argument, NULL, 1002}, + {"soft-clip", no_argument, NULL, 1003}, + {"hard-clip", no_argument, NULL, 1004}, + {"strand", no_argument, NULL, 1005}, + {"clipped", no_argument, NULL, 1006}, + {"fail", no_argument, NULL, 1007}, + {"both-ends", no_argument, NULL, 1008}, + {"filter-len", required_argument, NULL, 1009}, + {"fail-len", required_argument, NULL, 1010}, + {"no-excluded", no_argument, NULL, 1011}, + {"rejects-file", required_argument, NULL, 
1012}, + {"original", no_argument, NULL, 1013}, + {"keep-tag", no_argument, NULL, 1014}, + {"tolerance", required_argument, NULL, 1015}, + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) { + switch (c) { + case 'b': bedfile = optarg; break; + case 'o': fnout = optarg; break; + case 'f': param.stats_file = optarg; break; + case 'u': wmode[2] = '0'; break; + case 1002: param.add_pg = 0; break; + case 1003: clipping = soft_clip; break; + case 1004: clipping = hard_clip; break; + case 1005: param.use_strand = 1; break; + case 1006: param.write_clipped = 1; break; + case 1007: param.mark_fail = 1; break; + case 1008: param.both = 1; break; + case 1009: param.filter_len = atoi(optarg); break; + case 1010: param.fail_len = atoi(optarg); break; + case 1011: param.unmapped = 1; break; + case 1012: param.rejects_file = optarg; break; + case 1013: param.oa_tag = 1; break; + case 1014: param.del_tag = 0; break; + case 1015: param.tol = atoi(optarg); break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage(); samtools_exit(1); + } + } + + if (!bedfile) { + usage(); + return 1; + } + + if (optind + 1 > argc) { + usage(); + return 1; + } + + if (param.tol < 0) { + fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d," + " reseting tolerance to default of 5.\n", param.tol); + param.tol = 5; + } + + if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { + print_error_errno("ampliconclip", "cannot open input file"); + return 1; + } + + sam_open_mode(wmode+1, fnout, NULL); + + if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) { + print_error_errno("ampliconclip", "cannot open output file"); + return 1; + } + + if (param.rejects_file) { + sam_open_mode(wmode+1, param.rejects_file, NULL); + + if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) { + print_error_errno("ampliconclip", "cannot open rejects 
file"); + return 1; + } + } + + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(samtools_stderr, "[ampliconclip] error: cannot create thread pool.\n"); + return 1; + } + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + + if (reject) { + hts_set_opt(reject, HTS_OPT_THREAD_POOL, &p); + } + } + + param.arg_list = stringify_argv(argc + 1, argv - 1); + + ret = bam_clip(in, out, reject, bedfile, clipping, ¶m); + + // cleanup + sam_close(in); + + if (sam_close(out) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: error while closing output file %s.\n", argv[optind+1]); + ret = 1; + } + + if (reject) { + if (sam_close(reject) < 0) { + fprintf(samtools_stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file); + ret = 1; + } + } + + if (p.pool) hts_tpool_destroy(p.pool); + + sam_global_args_free(&ga); + free(param.arg_list); + + return ret; +} + diff --git a/samtools/bam_ampliconclip.h b/samtools/bam_ampliconclip.h new file mode 100644 index 0000000..ef35357 --- /dev/null +++ b/samtools/bam_ampliconclip.h @@ -0,0 +1,54 @@ +/* bam_ampliconclip.h -- shared functions between amplicon clip/stats + + Copyright (C) 2020-2021 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef BAM_AMPLICONCLIP_H +#define BAM_AMPLICONCLIP_H + +#include "htslib/khash.h" + +typedef struct { + int64_t left; + int64_t right; + int rev; +} bed_entry_t; + +typedef struct { + bed_entry_t *bp; + int64_t longest; + int length; + int size; +} bed_entry_list_t; + +KHASH_MAP_INIT_STR(bed_list_hash, bed_entry_list_t); + +#define BED_LIST_INIT {NULL, 0, 0, 0, {0}} + + +int load_bed_file_multi_ref(char *infile, int get_strand, + int sort_by_pos, khash_t(bed_list_hash) *bed_lists); + +void destroy_bed_hash(khash_t(bed_list_hash) *hash); + + +#endif /* BAM_AMPLICONCLIP_H */ diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c index 4e222a0..77d94f8 100644 --- a/samtools/bam_aux.c +++ b/samtools/bam_aux.c @@ -50,13 +50,13 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s) { if (s) { uint8_t *p, *aux; - aux = bam1_aux(b); + aux = bam_get_aux(b); p = s - 2; __skip_tag(s); memmove(aux, p, s - p); - b->data_len -= bam_get_l_aux(b) - (s - p); + b->l_data -= bam_get_l_aux(b) - (s - p); } else { - b->data_len -= bam_get_l_aux(b); + b->l_data -= bam_get_l_aux(b); } return 0; } diff --git a/samtools/bam_aux.c.pysam.c b/samtools/bam_aux.c.pysam.c index 0763976..39fe5ce 100644 --- a/samtools/bam_aux.c.pysam.c +++ b/samtools/bam_aux.c.pysam.c @@ -52,13 +52,13 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s) { if (s) { uint8_t *p, *aux; - aux = bam1_aux(b); + aux = bam_get_aux(b); p = s - 2; __skip_tag(s); memmove(aux, p, s - p); - b->data_len -= bam_get_l_aux(b) - (s - p); + b->l_data -= bam_get_l_aux(b) - (s - 
p); } else { - b->data_len -= bam_get_l_aux(b); + b->l_data -= bam_get_l_aux(b); } return 0; } diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c index f3c812a..ed8cf58 100644 --- a/samtools/bam_cat.c +++ b/samtools/bam_cat.c @@ -1,6 +1,6 @@ /* bam_cat.c -- efficiently concatenates bam files. - Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. + Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd. Modified SAMtools work copyright (C) 2010 Illumina, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy @@ -270,22 +270,13 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { - cram_block *blk; - - if (cram_container_is_empty(in_c)) { - if (cram_write_container(out_c, c) != 0) - return -1; - + if (cram_container_is_empty(in_c)) { + cram_block *blk; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; - if (cram_write_block(out_c, blk) != 0) { - cram_free_block(blk); - return -1; - } cram_free_block(blk); cram_free_container(c); - continue; } @@ -297,6 +288,7 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; + cram_block *blk; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) @@ -467,7 +459,7 @@ int main_cat(int argc, char *argv[]) char *outfn = 0; char **infns = NULL; // files to concatenate int infns_size = 0; - int c, ret = 0, no_pg = 0; + int c, ret = 0, no_pg = 0, usage = 0; samFile *in; sam_global_args ga; @@ -481,7 +473,7 @@ int main_cat(int argc, char *argv[]) sam_global_args_init(&ga); - while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); @@ 
-522,6 +514,8 @@ int main_cat(int argc, char *argv[]) break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; } } @@ -539,7 +533,7 @@ int main_cat(int argc, char *argv[]) } // Require at least one input file - if (infns_size + nargv_fns == 0) { + if (infns_size + nargv_fns == 0 || usage) { fprintf(stderr, "Usage: samtools cat [options] [... ]\n"); fprintf(stderr, " samtools cat [options] [... ]\n\n"); fprintf(stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c index 58a41b7..ef2199c 100644 --- a/samtools/bam_cat.c.pysam.c +++ b/samtools/bam_cat.c.pysam.c @@ -2,7 +2,7 @@ /* bam_cat.c -- efficiently concatenates bam files. - Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. + Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd. Modified SAMtools work copyright (C) 2010 Illumina, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy @@ -272,22 +272,13 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { - cram_block *blk; - - if (cram_container_is_empty(in_c)) { - if (cram_write_container(out_c, c) != 0) - return -1; - + if (cram_container_is_empty(in_c)) { + cram_block *blk; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; - if (cram_write_block(out_c, blk) != 0) { - cram_free_block(blk); - return -1; - } cram_free_block(blk); cram_free_container(c); - continue; } @@ -299,6 +290,7 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; + cram_block *blk; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) @@ -469,7 +461,7 @@ int main_cat(int argc, char *argv[]) char *outfn = 0; char **infns = NULL; // files to concatenate int infns_size = 0; - int c, ret = 0, no_pg = 0; + int c, ret = 0, no_pg = 0, usage = 0; samFile *in; sam_global_args ga; @@ -483,7 +475,7 @@ int main_cat(int argc, char *argv[]) sam_global_args_init(&ga); - while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); @@ -524,6 +516,8 @@ int main_cat(int argc, char *argv[]) break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; } } @@ -541,7 +535,7 @@ int main_cat(int argc, char *argv[]) } // Require at least one input file - if (infns_size + nargv_fns == 0) { + if (infns_size + nargv_fns == 0 || usage) { fprintf(samtools_stderr, "Usage: samtools cat [options] [... ]\n"); fprintf(samtools_stderr, " samtools cat [options] [... 
]\n\n"); fprintf(samtools_stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); diff --git a/samtools/bam_color.c b/samtools/bam_color.c index bee19b9..6decbc1 100644 --- a/samtools/bam_color.c +++ b/samtools/bam_color.c @@ -25,7 +25,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include "bam.h" +#include + +#include "htslib/sam.h" /*! @abstract Get the color encoding the previous and current base @@ -45,10 +47,10 @@ char bam_aux_getCSi(bam1_t *b, int i) cs = bam_aux2Z(c); // adjust for strandedness and leading adaptor - if(bam1_strand(b)) { + if(bam_is_rev(b)) { i = strlen(cs) - 1 - i; // adjust for leading hard clip - uint32_t cigar = bam1_cigar(b)[0]; + uint32_t cigar = bam_get_cigar(b)[0]; if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { i -= cigar >> BAM_CIGAR_SHIFT; } @@ -74,10 +76,10 @@ char bam_aux_getCQi(bam1_t *b, int i) cq = bam_aux2Z(c); // adjust for strandedness - if(bam1_strand(b)) { + if(bam_is_rev(b)) { i = strlen(cq) - 1 - i; // adjust for leading hard clip - uint32_t cigar = bam1_cigar(b)[0]; + uint32_t cigar = bam_get_cigar(b)[0]; if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { i -= (cigar >> BAM_CIGAR_SHIFT); } @@ -135,28 +137,28 @@ char bam_aux_getCEi(bam1_t *b, int i) cs = bam_aux2Z(c); // adjust for strandedness and leading adaptor - if(bam1_strand(b)) { //reverse strand + if(bam_is_rev(b)) { //reverse strand cs_i = strlen(cs) - 1 - i; // adjust for leading hard clip - uint32_t cigar = bam1_cigar(b)[0]; + uint32_t cigar = bam_get_cigar(b)[0]; if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { cs_i -= cigar >> BAM_CIGAR_SHIFT; } // get current color cur_color = cs[cs_i]; // get previous base. Note: must rc adaptor - prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; + prev_b = (cs_i == 1) ? 
"TGCAN"[(int)bam_aux_nt2int(cs[0])] : seq_nt16_str[bam_seqi(bam_get_seq(b), i+1)]; // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)]; } else { cs_i=i+1; // get current color cur_color = cs[cs_i]; // get previous base - prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + prev_b = (0 == i) ? cs[0] : seq_nt16_str[bam_seqi(bam_get_seq(b), i-1)]; // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)]; } // corrected color diff --git a/samtools/bam_color.c.pysam.c b/samtools/bam_color.c.pysam.c index 762e83b..105cc33 100644 --- a/samtools/bam_color.c.pysam.c +++ b/samtools/bam_color.c.pysam.c @@ -27,7 +27,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include "bam.h" +#include + +#include "htslib/sam.h" /*! @abstract Get the color encoding the previous and current base @@ -47,10 +49,10 @@ char bam_aux_getCSi(bam1_t *b, int i) cs = bam_aux2Z(c); // adjust for strandedness and leading adaptor - if(bam1_strand(b)) { + if(bam_is_rev(b)) { i = strlen(cs) - 1 - i; // adjust for leading hard clip - uint32_t cigar = bam1_cigar(b)[0]; + uint32_t cigar = bam_get_cigar(b)[0]; if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { i -= cigar >> BAM_CIGAR_SHIFT; } @@ -76,10 +78,10 @@ char bam_aux_getCQi(bam1_t *b, int i) cq = bam_aux2Z(c); // adjust for strandedness - if(bam1_strand(b)) { + if(bam_is_rev(b)) { i = strlen(cq) - 1 - i; // adjust for leading hard clip - uint32_t cigar = bam1_cigar(b)[0]; + uint32_t cigar = bam_get_cigar(b)[0]; if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { i -= (cigar >> BAM_CIGAR_SHIFT); } @@ -137,28 +139,28 @@ char bam_aux_getCEi(bam1_t *b, int i) cs = bam_aux2Z(c); // adjust for strandedness and leading adaptor - if(bam1_strand(b)) { //reverse strand + if(bam_is_rev(b)) { //reverse strand cs_i = strlen(cs) - 1 - i; // adjust for leading hard clip - uint32_t cigar = 
bam1_cigar(b)[0]; + uint32_t cigar = bam_get_cigar(b)[0]; if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { cs_i -= cigar >> BAM_CIGAR_SHIFT; } // get current color cur_color = cs[cs_i]; // get previous base. Note: must rc adaptor - prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; + prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : seq_nt16_str[bam_seqi(bam_get_seq(b), i+1)]; // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)]; } else { cs_i=i+1; // get current color cur_color = cs[cs_i]; // get previous base - prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + prev_b = (0 == i) ? cs[0] : seq_nt16_str[bam_seqi(bam_get_seq(b), i-1)]; // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)]; } // corrected color diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c index 44879c2..a4d757c 100644 --- a/samtools/bam_fastq.c +++ b/samtools/bam_fastq.c @@ -1,6 +1,6 @@ /* bam_fastq.c -- FASTA and FASTQ file generation - Copyright (C) 2009-2017, 2019 Genome Research Ltd. + Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -42,16 +42,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "samtools.h" #include "sam_opts.h" -#define taglist_free(p) -KLIST_INIT(ktaglist, char*, taglist_free) - #define DEFAULT_BARCODE_TAG "BC" #define DEFAULT_QUALITY_TAG "QT" #define INDEX_SEPARATOR "+" int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; -static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; - static void bam2fq_usage(FILE *to, const char *command) { int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; @@ -60,64 +55,71 @@ static void bam2fq_usage(FILE *to, const char *command) fprintf(to, "\n" "Description:\n" -"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" +"Converts a SAM, BAM or CRAM to %s format.\n" "\n" "Options:\n" -" -0 FILE write reads designated READ_OTHER to FILE\n" -" -1 FILE write reads designated READ1 to FILE\n" -" -2 FILE write reads designated READ2 to FILE\n" -" -o FILE write reads designated READ1 or READ2 to FILE\n" -" note: if a singleton file is specified with -s, only\n" -" paired reads will be written to the -1 and -2 files.\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 -" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -" -n don't append /1 and /2 to the read name\n" -" -N always append /1 and /2 to the read name\n"); +" -0 FILE write reads designated READ_OTHER to FILE\n" +" -1 FILE write reads designated READ1 to FILE\n" +" -2 FILE write reads designated READ2 to FILE\n" +" -o FILE write reads designated READ1 or READ2 to FILE\n" +" note: if a singleton file is specified with -s, only\n" +" paired reads will be written to the -1 and -2 files.\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +" -G INT 
only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -n don't append /1 and /2 to the read name\n" +" -N always append /1 and /2 to the read name\n", + fq ? "FASTQ" : "FASTA"); if (fq) fprintf(to, -" -O output quality in the OQ tag if present\n"); +" -O output quality in the OQ tag if present\n"); fprintf(to, -" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" -" -t copy RG, BC and QT tags to the %s header line\n", +" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" +" -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); fprintf(to, -" -T TAGLIST copy arbitrary tags to the %s header line\n", +" -T TAGLIST copy arbitrary tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); if (fq) fprintf(to, -" -v INT default quality score if not given in file [1]\n" -" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" -" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" -" --i1 FILE write first index reads to FILE\n" -" --i2 FILE write second index reads to FILE\n" -" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" -" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" -" --index-format STR How to parse barcode and quality tags\n\n"); +" -v INT default quality score if not given in file [1]\n" +" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" +" -c INT compression level [0..9] to use when writing bgzf files [1]\n" +" --i1 FILE write first index reads to FILE\n" +" --i2 FILE write second index reads to FILE\n" +" --barcode-tag TAG\n" +" Barcode tag [" DEFAULT_BARCODE_TAG "]\n" +" --quality-tag TAG\n" +" Quality tag [" DEFAULT_QUALITY_TAG "]\n" +" --index-format STR\n" +" How to parse barcode and quality tags\n\n"); sam_global_opt_help(to, "-.--.@-."); fprintf(to, "\n" -"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" -"The input to this 
program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" +"The files will be automatically compressed if the file names have a .gz\n" +"or .bgzf extension. The input to this program must be collated by name.\n" +"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n" "\n" "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" -"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" -"or both unset.\n" +"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n" "Run 'samtools flags' for more information on flag codes and meanings.\n"); fprintf(to, "\n" -"The index-format string describes how to parse the barcode and quality tags, for example:\n" -" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" -" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" -"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" -"'read until the separator or end of tag', for example:\n" -" n*i* ignore the left part of the tag until the separator, then use the second part\n" -" of the tag as index 1\n"); +"The index-format string describes how to parse the barcode and quality tags.\n" +"It is made up of 'i' or 'n' followed by a length or '*'. 
For example:\n" +" i14i8 The first 14 characters are index 1, the next 8 are index 2\n" +" n8i14 Ignore the first 8 characters, and use the next 14 for index 1\n\n" +"If the tag contains a separator, then the numeric part can be replaced with\n" +"'*' to mean 'read until the separator or end of tag', for example:\n" +" i*i* Break the tag at the separator into index 1 and index 2\n" +" n*i* Ignore the left part of the tag until the separator,\n" +" then use the second part of the tag as index 1\n"); fprintf(to, "\n" "Examples:\n" -" To get just the paired reads in separate files, use:\n" -" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" -"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" +"To get just the paired reads in separate files, use:\n" +" samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n" +"\nTo get all non-supplementary/secondary reads in a single file, redirect\n" +"the output:\n" " samtools %s in.bam > all_reads.%s\n", command, fq ? "fq" : "fa", fq ? "fq" : "fa", command, fq ? "fq" : "fa"); @@ -144,96 +146,20 @@ typedef struct bam2fq_opts { typedef struct bam2fq_state { samFile *fp; - BGZF *fpse; - BGZF *fpr[3]; - BGZF *fpi[2]; - BGZF *hstdout; + samFile *fpse; + samFile *fpr[3]; + samFile *fpi[3]; + samFile *hstdout; sam_hdr_t *h; bool has12, use_oq, copy_tags, illumina_tag; int flag_on, flag_off, flag_alloff; fastfile filetype; int def_qual; - klist_t(ktaglist) *taglist; char *index_sequence; char compression_level; htsThreadPool p; } bam2fq_state_t; -/* - * Get and decode the read from a BAM record. - * - * TODO: htslib really needs an interface for this. Consider this or perhaps - * bam_get_seq_str (current vs original orientation) and bam_get_qual_str - * functions as string formatted equivalents to bam_get_{seq,qual}? - */ - -/* - * Reverse a string in place. 
- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. - * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik - */ -static char *reverse(char *str) -{ - int i = strlen(str)-1,j=0; - char ch; - while (i>j) { - ch = str[i]; - str[i]= str[j]; - str[j] = ch; - i--; - j++; - } - return str; -} - -/* return the read, reverse complemented if necessary */ -static char *get_read(const bam1_t *rec) -{ - int len = rec->core.l_qseq + 1; - char *read = calloc(1, len); - char *seq = (char *)bam_get_seq(rec); - int n; - - if (!read) return NULL; - - for (n=0; n < rec->core.l_qseq; n++) { - if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; - else read[n] = seq_nt16_str[bam_seqi(seq,n)]; - } - if (rec->core.flag & BAM_FREVERSE) reverse(read); - return read; -} - -/* - * get and decode the quality from a BAM record - */ -static int get_quality(const bam1_t *rec, char **qual_out) -{ - char *quality = calloc(1, rec->core.l_qseq + 1); - char *q = (char *)bam_get_qual(rec); - int n; - - if (!quality) return -1; - - if (*q == '\xff') { - free(quality); - *qual_out = NULL; - return 0; - } - - for (n=0; n < rec->core.l_qseq; n++) { - quality[n] = q[n]+33; - } - if (rec->core.flag & BAM_FREVERSE) reverse(quality); - *qual_out = quality; - return 0; -} - -// -// End of htslib complaints -// - - static readpart which_readpart(const bam1_t *b) { if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { @@ -245,290 +171,8 @@ static readpart which_readpart(const bam1_t *b) } } -/* - * parse the length part from the index-format string - */ -static int getLength(char **s) -{ - int n = 0; - while (**s) { - if (**s == '*') { n=-1; (*s)++; break; } - if ( !isdigit(**s)) break; - n = n*10 + ((**s)-'0'); - (*s)++; - } - return n; -} - -static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) -{ - uint8_t *s = bam_aux_get(rec, tag); - if (s) { - char aux_type = *s; - 
switch (aux_type) { - case 'C': - case 'S': aux_type = 'I'; break; - case 'c': - case 's': aux_type = 'i'; break; - case 'd': aux_type = 'f'; break; - } - - // Ensure space. Need 6 chars + length of tag. Max length of - // i is 16, A is 21, B currently 26, Z is unknown, so - // have to check that one later. - if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; - - kputc('\t', linebuf); - kputsn(tag, 2, linebuf); - kputc(':', linebuf); - kputc(aux_type=='I'? 'i': aux_type, linebuf); - kputc(':', linebuf); - switch (aux_type) { - case 'H': - case 'Z': - if (kputs(bam_aux2Z(s), linebuf) < 0) return false; - break; - case 'i': kputw(bam_aux2i(s), linebuf); break; - case 'I': kputuw(bam_aux2i(s), linebuf); break; - case 'A': kputc(bam_aux2A(s), linebuf); break; - case 'f': kputd(bam_aux2f(s), linebuf); break; - case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; - default: kputs("*** Unknown aux type ***", linebuf); return false; - } - } - return true; -} - -static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) -{ - if (!index_sequence) return 0; - - kstring_t new = {0,0,NULL}; - if (linebuf->s) { - char *s = strchr(linebuf->s, '\n'); - if (s) { - if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) - return -1; - *s = 0; - kputs(linebuf->s, &new); - kputc(' ', &new); - readpart readpart = which_readpart(rec); - if (readpart == READ_1) kputc('1', &new); - else if (readpart == READ_2) kputc('2', &new); - else kputc('0', &new); - - kputc(':', &new); - if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); - else kputc('N', &new); - - kputs(":0:", &new); - kputs(index_sequence, &new); - kputc('\n', &new); - kputs(s+1, &new); - free(ks_release(linebuf)); - linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; - } - } - return 0; -} - -static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) -{ - int i; - - linebuf->l = 0; - 
// Write read name - if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; - if (kputs(bam_get_qname(rec), linebuf) < 0) return false; - // Add the /1 /2 if requested - if (state->has12) { - readpart readpart = which_readpart(rec); - if (readpart == READ_1) { - if (kputs("/1", linebuf) < 0) return false; - } else if (readpart == READ_2) { - if (kputs("/2", linebuf) < 0) return false; - } - } - if (state->copy_tags) { - for (i = 0; copied_tags[i]; ++i) { - if (!copy_tag(copied_tags[i], rec, linebuf)) { - fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); - return false; - } - } - } - - if (state->taglist->size) { - kliter_t(ktaglist) *p; - for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { - if (!copy_tag(kl_val(p), rec, linebuf)) { - fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); - return false; - } - } - } - - if (kputc('\n', linebuf) < 0) return false; - if (kputs(seq, linebuf) < 0) return false; - if (kputc('\n', linebuf) < 0) return false; - - if (state->filetype == FASTQ) { - // Write quality - if (kputs("+\n", linebuf) < 0) return false; - if (qual && *qual) { - if (kputs(qual, linebuf) < 0) return false; - } else { - int len = strlen(seq); - if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; - for (i = 0; i < len; ++i) { - kputc(33 + state->def_qual, linebuf); - } - } - if (kputc('\n', linebuf) < 0) return false; - } - return true; -} - -/* - * Create FASTQ lines from the barcode tag using the index-format - */ -static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) -{ - uint8_t *p; - char *ifmt = opts->index_format; - char *tag = NULL; - char *qual = NULL; - char *sub_tag = NULL; - char *sub_qual = NULL; - size_t tag_len; - int file_number = 0; - kstring_t linebuf = { 0, 0, NULL }; // Buffer - - if (!ifmt) return true; - - // read barcode tag - p = bam_aux_get(rec,opts->barcode_tag); - if (p) tag = bam_aux2Z(p); - - if (!tag) 
return true; // there is no tag - - tag_len = strlen(tag); - sub_tag = calloc(1, tag_len + 1); - if (!sub_tag) goto fail; - sub_qual = calloc(1, tag_len + 1); - if (!sub_qual) goto fail; - - // read quality tag - p = bam_aux_get(rec, opts->quality_tag); - if (p) qual = bam_aux2Z(p); - - // Parse the index-format string - while (*ifmt) { - if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly - char action = *ifmt; // should be 'i' or 'n' - ifmt++; // skip over action - int index_len = getLength(&ifmt); - int n = 0; - - if (index_len < 0) { - // read until separator - while (isalpha(*tag)) { - sub_tag[n] = *tag++; - if (qual) sub_qual[n] = *qual++; - n++; - } - if (*tag) { // skip separator - tag++; - if (qual) qual++; - } - } else { - // read index_len characters - while (index_len-- && *tag) { - sub_tag[n] = *tag++; - if (qual) sub_qual[n] = *qual++; - n++; - } - } - sub_tag[n] = '\0'; - sub_qual[n] = '\0'; - - if (action=='i' && *sub_tag) { - if (state->index_sequence) { - char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); - if (!new_index_sequence) goto fail; - state->index_sequence = new_index_sequence; - strcat(state->index_sequence, INDEX_SEPARATOR); - strcat(state->index_sequence, sub_tag); - } else { - state->index_sequence = strdup(sub_tag); // we're going to need this later... 
- } - if (!state->index_sequence) goto fail; - if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; - if (state->illumina_tag) { - if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { - goto fail; - } - } - if (state->fpi[file_number]) { - if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) - goto fail; - } - } - - } - - free(sub_qual); free(sub_tag); - free(linebuf.s); - return true; - - fail: - perror(__func__); - free(sub_qual); free(sub_tag); - free(linebuf.s); - return false; -} - -// Transform a bam1_t record into a string with the FASTQ representation of it -// @returns false for error, true for success -static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) -{ - int32_t qlen = b->core.l_qseq; - assert(qlen >= 0); - const uint8_t *oq = NULL; - char *qual = NULL; - - char *seq = get_read(b); - if (!seq) return false; - - if (state->use_oq) oq = bam_aux_get(b, "OQ"); - if (oq && *oq=='Z') { - qual = strdup(bam_aux2Z(oq)); - if (!qual) goto fail; - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - reverse(qual); - } - } else { - if (get_quality(b, &qual) < 0) goto fail; - } - - if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; - - free(qual); - free(seq); - return true; - - fail: - free(seq); - free(qual); - return false; -} - static void free_opts(bam2fq_opts_t *opts) { - free(opts->barcode_tag); - free(opts->quality_tag); - free(opts->index_format); - free(opts->extra_tags); free(opts); } @@ -566,13 +210,14 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) {"quality-tag", required_argument, NULL, 'q'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { + while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", + lopts, NULL)) > 0) { switch (c) { - case 'b': opts->barcode_tag = strdup(optarg); break; - case 'q': opts->quality_tag = 
strdup(optarg); break; + case 'b': opts->barcode_tag = optarg; break; + case 'q': opts->quality_tag = optarg; break; case 1 : opts->index_file[0] = optarg; break; case 2 : opts->index_file[1] = optarg; break; - case 3 : opts->index_format = strdup(optarg); break; + case 3 : opts->index_format = optarg; break; case '0': opts->fnr[0] = optarg; break; case '1': opts->fnr[1] = optarg; break; case '2': opts->fnr[2] = optarg; break; @@ -583,7 +228,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) flag_off_set = 1; opts->flag_off = 0; } - opts->flag_off |= strtol(optarg, 0, 0); break; + opts->flag_off |= strtol(optarg, 0, 0); + break; case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; case 'N': opts->has12always = true; break; @@ -591,13 +237,25 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case 's': opts->fnse = optarg; break; case 't': opts->copy_tags = true; break; case 'i': opts->illumina_tag = true; break; - case 'c': opts->compression_level = atoi(optarg); break; - case 'T': opts->extra_tags = strdup(optarg); break; + case 'c': + opts->compression_level = atoi(optarg); + if (opts->compression_level < 0) + opts->compression_level = 0; + if (opts->compression_level > 9) + opts->compression_level = 9; + break; + case 'T': opts->extra_tags = optarg; break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; + + case '?': + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; } break; } @@ -606,8 +264,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; if (opts->has12always) opts->has12 = true; - if 
(!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); - if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); + if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG; + if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG; int nIndex = 0; if (opts->index_format) { @@ -652,7 +310,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) } const char* type_str = argv[0]; - if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { + if (strcasecmp("fastq", type_str) == 0 || + strcasecmp("bam2fq", type_str) == 0) { opts->filetype = FASTQ; } else if (strcasecmp("fasta", type_str) == 0) { opts->filetype = FASTA; @@ -680,34 +339,61 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) return true; } -static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) -{ - char mode[4] = "w"; - size_t len = strlen(filename); - - mode[2] = 0; mode[3] = 0; - if (len > 3 && strstr(filename + (len - 3),".gz")) { - mode[1] = 'g'; mode[2] = c+'0'; - } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) - || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { - mode[1] = c+'0'; - } else { - mode[1] = 'u'; +void set_sam_opts(samFile *fp, bam2fq_state_t *state, + const bam2fq_opts_t *opts) { + if (state->has12) + hts_set_opt(fp, FASTQ_OPT_RNUM, 1); + + if (state->illumina_tag) + hts_set_opt(fp, FASTQ_OPT_CASAVA, 1); + + hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag); + + kstring_t tag_list = {0,0}; + if (state->copy_tags) + kputs("RG,BC,QT", &tag_list); + if (opts->extra_tags) { + if (tag_list.l) + kputc(',', &tag_list); + kputs(opts->extra_tags, &tag_list); } + if (tag_list.l) + hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); + ks_free(&tag_list); +} - BGZF *fp = bgzf_open(filename,mode); +// Open a file as normal or gzipped based on filename. +// Note we always use bgzf and don't bother to attempt non-blocked +// gzip streams. 
This is a departure from the old fastq code. +static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) { + char modez[6]; + strcpy(modez, mode); + + size_t l = strlen(fn); + if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) || + (l > 4 && strcmp(fn+l-4, ".bgz") == 0) || + (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) { + char m[3] = {'z', state->compression_level+'0', '\0'}; + strcat(modez, m); + } + + samFile *fp = sam_open(fn, modez); if (!fp) - return fp; - if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { - bgzf_close(fp); return NULL; - } + + if (state->p.pool) + hts_set_thread_pool(fp, &state->p); + return fp; } static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) { + char *mode = opts->filetype == FASTA ? "wF" : "wf"; + bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); + if (!state) + return false; state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; state->flag_alloff = opts->flag_alloff; @@ -721,22 +407,6 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->hstdout = NULL; state->compression_level = opts->compression_level; - state->taglist = kl_init(ktaglist); - if (opts->extra_tags) { - char *save_p; - char *s = strtok_r(opts->extra_tags, ",", &save_p); - while (s) { - if (strlen(s) != 2) { - fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s); - free(state); - return false; - } - char **et = kl_pushp(ktaglist, state->taglist); - *et = s; - s = strtok_r(NULL, ",", &save_p); - } - } - state->fp = sam_open(opts->fn_input, "r"); if (state->fp == NULL) { print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); @@ -768,12 +438,12 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } if (opts->fnse) { - state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); - if (state->fpse == NULL) { - print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", 
opts->fnse); + if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) { + print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse); free(state); return false; } + set_sam_opts(state->fpse, state, opts); } if (opts->ga.reference) { @@ -784,6 +454,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) } } + // single, read1, read2 int i, j; for (i = 0; i < 3; ++i) { if (opts->fnr[i]) { @@ -791,28 +462,30 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) break; if (j == i) { - state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p); - if (state->fpr[i] == NULL) { - print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", + if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) { + print_error_errno("bam2fq", "Cannot open r%d file \"%s\"", i, opts->fnr[i]); free(state); return false; } + set_sam_opts(state->fpr[i], state, opts); } else { state->fpr[i] = state->fpr[j]; } } else { if (!state->hstdout) { - state->hstdout = bgzf_dopen(fileno(stdout), "wu"); - if (!state->hstdout) { + if (!(state->hstdout = sam_open_z("-", mode, state))) { print_error_errno("bam2fq", "Cannot open STDOUT"); free(state); return false; } + set_sam_opts(state->hstdout, state, opts); } state->fpr[i] = state->hstdout; } } + + // index 1, index 2 for (i = 0; i < 2; i++) { state->fpi[i] = NULL; if (opts->index_file[i]) { @@ -823,13 +496,14 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) break; if (i == j) { - state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); - if (state->fpi[i] == NULL) { - print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", + if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode, + state))) { + print_error_errno("bam2fq", "Cannot open i%d 
file \"%s\"", i+1, opts->index_file[i]); free(state); return false; } + set_sam_opts(state->fpi[i], state, opts); } else if (j < 0) { state->fpi[i] = state->fpr[j+3]; } else { @@ -854,21 +528,25 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* bool valid = true; sam_hdr_destroy(state->h); check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); - if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } + if (state->fpse && sam_close(state->fpse) < 0) { + print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); + valid = false; + } + int i, j; for (i = 0; i < 3; ++i) { if (state->fpr[i] != state->hstdout) { for (j = 0; j < i; j++) if (state->fpr[i] == state->fpr[j]) break; - if (j == i && bgzf_close(state->fpr[i])) { + if (j == i && sam_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } } } if (state->hstdout) { - if (bgzf_close(state->hstdout)) { + if (sam_close(state->hstdout) < 0) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; } @@ -880,12 +558,11 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* for (j -= 3; j >= 0 && j < i; j++) if (state->fpi[i] == state->fpi[j]) break; - if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { + if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) { print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); valid = false; } } - kl_destroy(ktaglist,state->taglist); free(state->index_sequence); if (state->p.pool) hts_tpool_destroy(state->p.pool); @@ -901,135 +578,300 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) } +int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state, + bam2fq_opts_t* opts, char *seq, int seq_len, + char *qual, int qual_len) { + if (!fp || !b || 
!seq_len) + return 0; + + int ret = -1; + bam1_t *b2 = bam_init1(); // FIXME: reuse + if (!b2) + return -1; + + size_t aux_len = b->data + b->l_data - bam_get_aux(b); + if (bam_set1(b2, b->core.l_qname, bam_get_qname(b), + (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE, + -1, -1, 0, // refid, pos, mapq + 0, NULL, // cigar + -1, -1, 0, // rnext, pnext, tlen + seq_len, seq, qual, + aux_len) < 0) + goto err; + + uint8_t *q = bam_get_qual(b2); + if (qual) { + int i; + for (i = 0; i < seq_len; i++) + q[i] -= '!'; + } else { + memset(q, opts->def_qual, seq_len); + } + + memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len); + b2->l_data += aux_len; + if (sam_write1(fp, state->h, b2) < 0) + goto err; + + ret = 0; + err: + if (b2) + bam_destroy1(b2); + return ret; +} + +int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, + bam2fq_opts_t* opts) { + bam1_t *b[2] = {b1, b2}; + + char *ifmt = opts->index_format; + if (!ifmt) + ifmt = "i*i*"; + + // Get seq / qual elements + char *bc = NULL, *qt = NULL; + if (b1) + bc = (char *)bam_aux_get(b1, opts->barcode_tag); + if (b2 && !bc) + bc = (char *)bam_aux_get(b2, opts->barcode_tag); + if (!bc) + return 0; + else + bc++; // skip Z + + if (b1) + qt = (char *)bam_aux_get(b1, opts->quality_tag); + if (b2 && !qt) + qt = (char *)bam_aux_get(b2, opts->quality_tag); + if (qt && strlen(bc) != strlen(qt)-1) + qt = NULL; + else if (qt) + qt++; + + int inum = 0; + while (inum < 2) { + char fc = *ifmt++; + if (!fc) + break; // ran out of index-format + + long len, rem = 0; + if (isdigit(*ifmt)) { + rem = len = strtol(ifmt, &ifmt, 10); + } else { + ifmt++; + len = 0; + } + + char *bc_end = bc, *qt_end = qt; + while (len ? 
*bc_end && rem-- : isalpha(*bc_end)) + bc_end++, qt_end += qt != NULL; + + switch (fc) { + case 'n': + // skip + bc = bc_end + (len==0); + if (qt) + qt = qt_end + (len==0); + break; + + case 'i': + if (write_index_rec(state->fpi[inum], b[inum], state, opts, + bc, bc_end-bc, qt, qt_end-qt) < 0) + return -1; + bc = bc_end + (len==0); + if (qt) + qt = qt_end + (len==0); + inum++; + break; + + default: + fprintf(stderr, "Unknown index-format code\n"); + return -1; + } + } + + return 0; +} + +static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts, + bam1_t *b[4], int score[3], int best[3], + int64_t *n_singletons) { + // Paired data, with 1 or 2 ends present. + if (score[1] > 0 && score[2] > 0) { + // If CASAVA tag is required and barcode is only on R1, + // copy it to R2 + if (state->illumina_tag) { + char *tag; + if ((tag = (char *)bam_aux_get(b[best[1]], + opts->barcode_tag))) + if (bam_aux_update_str(b[best[2]], + opts->barcode_tag, + strlen(tag), tag+1) < 0) + goto err; + if ((tag = (char *)bam_aux_get(b[best[1]], + opts->quality_tag))) + if (bam_aux_update_str(b[best[2]], + opts->quality_tag, + strlen(tag), tag+1) < 0) + goto err; + + } + if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0) + goto err; + if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0) + goto err; + + if (output_index(b[best[1]], b[best[2]], state, opts) < 0) + goto err; + } else if (score[1] > 0 || score[2] > 0) { + if (state->fpse) { + // print whichever one exists to fpse + if (score[1] > 0) { + if (sam_write1(state->fpse, state->h, b[best[1]]) < 0) + goto err; + } else { + if (sam_write1(state->fpse, state->h, b[best[2]]) < 0) + goto err; + } + ++(*n_singletons); + } else { + if (score[1] > 0) { + if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0) + goto err; + } else { + if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0) + goto err; + } + } + + if (output_index(score[1] > 0 ? b[best[1]] : NULL, + score[2] > 0 ? 
b[best[2]] : NULL, + state, opts) < 0) + goto err; + } + + if (score[0]) { // single ended data (neither READ1 nor READ2) + if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0) + goto err; + + if (output_index(b[best[0]], NULL, state, opts) < 0) + goto err; + } + + return 0; + + err: + return -1; +} + static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) { int n; - bam1_t *records[3] = {NULL, NULL, NULL}; char *current_qname = NULL; int64_t n_reads = 0, n_singletons = 0; // Statistics - kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; int score[3]; int at_eof; - bool valid = true; - bam1_t* b = NULL; + bool valid = false; + int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices; + // indexed by [readpart] + bam1_t *b[4]; // 3 readparts, plus current record - while (true) { - if (!b) - b = bam_init1(); - if (b == NULL) { + for (n = 0; n < 4; n++) { + if (!(b[n] = bam_init1())) { perror("[bam2fq_mainloop] Malloc error for bam record buffer."); - valid = false; - break; + return false; } - int res = sam_read1(state->fp, state->h, b); + } + + n = 0; + while (true) { + int res = sam_read1(state->fp, state->h, b[n]); if (res < -1) { fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); - valid = false; - break; + goto err; } at_eof = res < 0; - if (!at_eof && filter_it_out(b, state)) + if (!at_eof && filter_it_out(b[n], state)) continue; - if (!at_eof) ++n_reads; - - if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { - if (current_qname) { - if (state->illumina_tag) { - for (n=0; valid && n<3; n++) { - if (!records[n]) continue; - if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; - } - if (!valid) break; - } - free(state->index_sequence); state->index_sequence = NULL; - if (score[1] > 0 && score[2] > 0) { - // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] - if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid 
= false; break; } - if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } else if (score[1] > 0 || score[2] > 0) { - if (state->fpse) { - // print whichever one exists to fpse - if (score[1] > 0) { - if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } - } else { - if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } - ++n_singletons; - } else { - if (score[1] > 0) { - if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } - } else { - if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } - } - } - if (score[0]) { // TODO: check this - // print linebuf[0] to fpr[0] - if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } - } + if (!at_eof) { + ++n_reads; + + // Handle -O option: use OQ for qual + uint8_t *oq; + if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') { + int i, l = strlen((char *)++oq); + uint8_t *qual = bam_get_qual(b[n]); + for (i = 0; i < l && i < b[n]->core.l_qseq; i++) + qual[i] = oq[i] - '!'; } + } + if (at_eof + || !current_qname + || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) { + // New name, so flush best examples of previous name. + if (current_qname) + if (flush_rec(state, opts, b, score, best, &n_singletons) < 0) + goto err; - free(current_qname); current_qname = NULL; + current_qname = bam_get_qname(b[n]); score[0] = score[1] = score[2] = 0; - for (n=0; n < 3; n++) { - bam_destroy1(records[n]); records[n]=NULL; - } if (at_eof) { break; } - - current_qname = strdup(bam_get_qname(b)); - if (!current_qname) { valid = false; break; } } // Prefer a copy of the read that has base qualities - int b_score = bam_get_qual(b)[0] != 0xff? 
2 : 1; - readpart rp = which_readpart(b); - if (b_score > score[rp]) { - if (!tags2fq(b, state, opts)) { valid = false; break; } - if (records[rp]) bam_destroy1(records[rp]); - records[rp] = b; + int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1; + readpart rp = which_readpart(b[n]); + if (score[rp] < b_score) { score[rp] = b_score; - b = NULL; - if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { - fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); - valid = false; break; - } + // Record b[n] slot for best copy of readpair and find a new + // slot for next bam read + best[rp] = n; + int used_slot[4] = {0}, i; + for (i = 0; i < 3; i++) + if (best[i] >= 0) + used_slot[best[i]] = 1; + for (i = 0; i < 4 && used_slot[i]; i++) + ; + n = i; } } + + valid = true; + err: if (!valid) - { - perror("[bam2fq_mainloop] Error writing to FASTx files."); - } - bam_destroy1(b); - for (n=0; n < 3; n++) { - bam_destroy1(records[n]); - } - free(current_qname); - free(linebuf[0].s); - free(linebuf[1].s); - free(linebuf[2].s); - fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); - fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); + print_error_errno("bam2fq", "Error writing to FASTx files."); + + for (n = 0; n < 4; n++) + bam_destroy1(b[n]); + + fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", + __func__, n_singletons); + fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", + __func__, n_reads); return valid; } int main_bam2fq(int argc, char *argv[]) { - int status = EXIT_SUCCESS; + int status = EXIT_FAILURE; bam2fq_opts_t* opts = NULL; bam2fq_state_t* state = NULL; bool valid = parse_opts(argc, argv, &opts); if (!valid || opts == NULL) return valid ? 
EXIT_SUCCESS : EXIT_FAILURE; - if (!init_state(opts, &state)) return EXIT_FAILURE; + if (!init_state(opts, &state)) goto err; + + if (!bam2fq_mainloop(state,opts)) goto err; - if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; + if (!destroy_state(opts, state, &status)) goto err; - if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; + status = EXIT_SUCCESS; + err: sam_global_args_free(&opts->ga); free_opts(opts); diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c index 2fe4c87..f7249d1 100644 --- a/samtools/bam_fastq.c.pysam.c +++ b/samtools/bam_fastq.c.pysam.c @@ -2,7 +2,7 @@ /* bam_fastq.c -- FASTA and FASTQ file generation - Copyright (C) 2009-2017, 2019 Genome Research Ltd. + Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -44,16 +44,11 @@ DEALINGS IN THE SOFTWARE. */ #include "samtools.h" #include "sam_opts.h" -#define taglist_free(p) -KLIST_INIT(ktaglist, char*, taglist_free) - #define DEFAULT_BARCODE_TAG "BC" #define DEFAULT_QUALITY_TAG "QT" #define INDEX_SEPARATOR "+" int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; -static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; - static void bam2fq_usage(FILE *to, const char *command) { int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; @@ -62,64 +57,71 @@ static void bam2fq_usage(FILE *to, const char *command) fprintf(to, "\n" "Description:\n" -"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" +"Converts a SAM, BAM or CRAM to %s format.\n" "\n" "Options:\n" -" -0 FILE write reads designated READ_OTHER to FILE\n" -" -1 FILE write reads designated READ1 to FILE\n" -" -2 FILE write reads designated READ2 to FILE\n" -" -o FILE write reads designated READ1 or READ2 to FILE\n" -" note: if a singleton file is specified with -s, only\n" -" paired reads will be written to the 
-1 and -2 files.\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 -" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -" -n don't append /1 and /2 to the read name\n" -" -N always append /1 and /2 to the read name\n"); +" -0 FILE write reads designated READ_OTHER to FILE\n" +" -1 FILE write reads designated READ1 to FILE\n" +" -2 FILE write reads designated READ2 to FILE\n" +" -o FILE write reads designated READ1 or READ2 to FILE\n" +" note: if a singleton file is specified with -s, only\n" +" paired reads will be written to the -1 and -2 files.\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -n don't append /1 and /2 to the read name\n" +" -N always append /1 and /2 to the read name\n", + fq ? "FASTQ" : "FASTA"); if (fq) fprintf(to, -" -O output quality in the OQ tag if present\n"); +" -O output quality in the OQ tag if present\n"); fprintf(to, -" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" -" -t copy RG, BC and QT tags to the %s header line\n", +" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" +" -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); fprintf(to, -" -T TAGLIST copy arbitrary tags to the %s header line\n", +" -T TAGLIST copy arbitrary tags to the %s header line\n", fq ? 
"FASTQ" : "FASTA"); if (fq) fprintf(to, -" -v INT default quality score if not given in file [1]\n" -" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" -" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" -" --i1 FILE write first index reads to FILE\n" -" --i2 FILE write second index reads to FILE\n" -" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" -" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" -" --index-format STR How to parse barcode and quality tags\n\n"); +" -v INT default quality score if not given in file [1]\n" +" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" +" -c INT compression level [0..9] to use when writing bgzf files [1]\n" +" --i1 FILE write first index reads to FILE\n" +" --i2 FILE write second index reads to FILE\n" +" --barcode-tag TAG\n" +" Barcode tag [" DEFAULT_BARCODE_TAG "]\n" +" --quality-tag TAG\n" +" Quality tag [" DEFAULT_QUALITY_TAG "]\n" +" --index-format STR\n" +" How to parse barcode and quality tags\n\n"); sam_global_opt_help(to, "-.--.@-."); fprintf(to, "\n" -"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" -"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" +"The files will be automatically compressed if the file names have a .gz\n" +"or .bgzf extension. 
The input to this program must be collated by name.\n" +"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n" "\n" "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" -"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" -"or both unset.\n" +"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n" "Run 'samtools flags' for more information on flag codes and meanings.\n"); fprintf(to, "\n" -"The index-format string describes how to parse the barcode and quality tags, for example:\n" -" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" -" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" -"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" -"'read until the separator or end of tag', for example:\n" -" n*i* ignore the left part of the tag until the separator, then use the second part\n" -" of the tag as index 1\n"); +"The index-format string describes how to parse the barcode and quality tags.\n" +"It is made up of 'i' or 'n' followed by a length or '*'. 
For example:\n" +" i14i8 The first 14 characters are index 1, the next 8 are index 2\n" +" n8i14 Ignore the first 8 characters, and use the next 14 for index 1\n\n" +"If the tag contains a separator, then the numeric part can be replaced with\n" +"'*' to mean 'read until the separator or end of tag', for example:\n" +" i*i* Break the tag at the separator into index 1 and index 2\n" +" n*i* Ignore the left part of the tag until the separator,\n" +" then use the second part of the tag as index 1\n"); fprintf(to, "\n" "Examples:\n" -" To get just the paired reads in separate files, use:\n" -" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" -"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" +"To get just the paired reads in separate files, use:\n" +" samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n" +"\nTo get all non-supplementary/secondary reads in a single file, redirect\n" +"the output:\n" " samtools %s in.bam > all_reads.%s\n", command, fq ? "fq" : "fa", fq ? "fq" : "fa", command, fq ? "fq" : "fa"); @@ -146,96 +148,20 @@ typedef struct bam2fq_opts { typedef struct bam2fq_state { samFile *fp; - BGZF *fpse; - BGZF *fpr[3]; - BGZF *fpi[2]; - BGZF *hsamtools_stdout; + samFile *fpse; + samFile *fpr[3]; + samFile *fpi[3]; + samFile *hsamtools_stdout; sam_hdr_t *h; bool has12, use_oq, copy_tags, illumina_tag; int flag_on, flag_off, flag_alloff; fastfile filetype; int def_qual; - klist_t(ktaglist) *taglist; char *index_sequence; char compression_level; htsThreadPool p; } bam2fq_state_t; -/* - * Get and decode the read from a BAM record. - * - * TODO: htslib really needs an interface for this. Consider this or perhaps - * bam_get_seq_str (current vs original orientation) and bam_get_qual_str - * functions as string formatted equivalents to bam_get_{seq,qual}? - */ - -/* - * Reverse a string in place. 
- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. - * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik - */ -static char *reverse(char *str) -{ - int i = strlen(str)-1,j=0; - char ch; - while (i>j) { - ch = str[i]; - str[i]= str[j]; - str[j] = ch; - i--; - j++; - } - return str; -} - -/* return the read, reverse complemented if necessary */ -static char *get_read(const bam1_t *rec) -{ - int len = rec->core.l_qseq + 1; - char *read = calloc(1, len); - char *seq = (char *)bam_get_seq(rec); - int n; - - if (!read) return NULL; - - for (n=0; n < rec->core.l_qseq; n++) { - if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; - else read[n] = seq_nt16_str[bam_seqi(seq,n)]; - } - if (rec->core.flag & BAM_FREVERSE) reverse(read); - return read; -} - -/* - * get and decode the quality from a BAM record - */ -static int get_quality(const bam1_t *rec, char **qual_out) -{ - char *quality = calloc(1, rec->core.l_qseq + 1); - char *q = (char *)bam_get_qual(rec); - int n; - - if (!quality) return -1; - - if (*q == '\xff') { - free(quality); - *qual_out = NULL; - return 0; - } - - for (n=0; n < rec->core.l_qseq; n++) { - quality[n] = q[n]+33; - } - if (rec->core.flag & BAM_FREVERSE) reverse(quality); - *qual_out = quality; - return 0; -} - -// -// End of htslib complaints -// - - static readpart which_readpart(const bam1_t *b) { if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { @@ -247,290 +173,8 @@ static readpart which_readpart(const bam1_t *b) } } -/* - * parse the length part from the index-format string - */ -static int getLength(char **s) -{ - int n = 0; - while (**s) { - if (**s == '*') { n=-1; (*s)++; break; } - if ( !isdigit(**s)) break; - n = n*10 + ((**s)-'0'); - (*s)++; - } - return n; -} - -static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) -{ - uint8_t *s = bam_aux_get(rec, tag); - if (s) { - char aux_type = *s; - 
switch (aux_type) { - case 'C': - case 'S': aux_type = 'I'; break; - case 'c': - case 's': aux_type = 'i'; break; - case 'd': aux_type = 'f'; break; - } - - // Ensure space. Need 6 chars + length of tag. Max length of - // i is 16, A is 21, B currently 26, Z is unknown, so - // have to check that one later. - if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; - - kputc('\t', linebuf); - kputsn(tag, 2, linebuf); - kputc(':', linebuf); - kputc(aux_type=='I'? 'i': aux_type, linebuf); - kputc(':', linebuf); - switch (aux_type) { - case 'H': - case 'Z': - if (kputs(bam_aux2Z(s), linebuf) < 0) return false; - break; - case 'i': kputw(bam_aux2i(s), linebuf); break; - case 'I': kputuw(bam_aux2i(s), linebuf); break; - case 'A': kputc(bam_aux2A(s), linebuf); break; - case 'f': kputd(bam_aux2f(s), linebuf); break; - case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; - default: kputs("*** Unknown aux type ***", linebuf); return false; - } - } - return true; -} - -static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) -{ - if (!index_sequence) return 0; - - kstring_t new = {0,0,NULL}; - if (linebuf->s) { - char *s = strchr(linebuf->s, '\n'); - if (s) { - if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) - return -1; - *s = 0; - kputs(linebuf->s, &new); - kputc(' ', &new); - readpart readpart = which_readpart(rec); - if (readpart == READ_1) kputc('1', &new); - else if (readpart == READ_2) kputc('2', &new); - else kputc('0', &new); - - kputc(':', &new); - if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); - else kputc('N', &new); - - kputs(":0:", &new); - kputs(index_sequence, &new); - kputc('\n', &new); - kputs(s+1, &new); - free(ks_release(linebuf)); - linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; - } - } - return 0; -} - -static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) -{ - int i; - - linebuf->l = 0; - 
// Write read name - if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; - if (kputs(bam_get_qname(rec), linebuf) < 0) return false; - // Add the /1 /2 if requested - if (state->has12) { - readpart readpart = which_readpart(rec); - if (readpart == READ_1) { - if (kputs("/1", linebuf) < 0) return false; - } else if (readpart == READ_2) { - if (kputs("/2", linebuf) < 0) return false; - } - } - if (state->copy_tags) { - for (i = 0; copied_tags[i]; ++i) { - if (!copy_tag(copied_tags[i], rec, linebuf)) { - fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); - return false; - } - } - } - - if (state->taglist->size) { - kliter_t(ktaglist) *p; - for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { - if (!copy_tag(kl_val(p), rec, linebuf)) { - fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); - return false; - } - } - } - - if (kputc('\n', linebuf) < 0) return false; - if (kputs(seq, linebuf) < 0) return false; - if (kputc('\n', linebuf) < 0) return false; - - if (state->filetype == FASTQ) { - // Write quality - if (kputs("+\n", linebuf) < 0) return false; - if (qual && *qual) { - if (kputs(qual, linebuf) < 0) return false; - } else { - int len = strlen(seq); - if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; - for (i = 0; i < len; ++i) { - kputc(33 + state->def_qual, linebuf); - } - } - if (kputc('\n', linebuf) < 0) return false; - } - return true; -} - -/* - * Create FASTQ lines from the barcode tag using the index-format - */ -static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) -{ - uint8_t *p; - char *ifmt = opts->index_format; - char *tag = NULL; - char *qual = NULL; - char *sub_tag = NULL; - char *sub_qual = NULL; - size_t tag_len; - int file_number = 0; - kstring_t linebuf = { 0, 0, NULL }; // Buffer - - if (!ifmt) return true; - - // read barcode tag - p = bam_aux_get(rec,opts->barcode_tag); - if (p) tag = bam_aux2Z(p); 
- - if (!tag) return true; // there is no tag - - tag_len = strlen(tag); - sub_tag = calloc(1, tag_len + 1); - if (!sub_tag) goto fail; - sub_qual = calloc(1, tag_len + 1); - if (!sub_qual) goto fail; - - // read quality tag - p = bam_aux_get(rec, opts->quality_tag); - if (p) qual = bam_aux2Z(p); - - // Parse the index-format string - while (*ifmt) { - if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly - char action = *ifmt; // should be 'i' or 'n' - ifmt++; // skip over action - int index_len = getLength(&ifmt); - int n = 0; - - if (index_len < 0) { - // read until separator - while (isalpha(*tag)) { - sub_tag[n] = *tag++; - if (qual) sub_qual[n] = *qual++; - n++; - } - if (*tag) { // skip separator - tag++; - if (qual) qual++; - } - } else { - // read index_len characters - while (index_len-- && *tag) { - sub_tag[n] = *tag++; - if (qual) sub_qual[n] = *qual++; - n++; - } - } - sub_tag[n] = '\0'; - sub_qual[n] = '\0'; - - if (action=='i' && *sub_tag) { - if (state->index_sequence) { - char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); - if (!new_index_sequence) goto fail; - state->index_sequence = new_index_sequence; - strcat(state->index_sequence, INDEX_SEPARATOR); - strcat(state->index_sequence, sub_tag); - } else { - state->index_sequence = strdup(sub_tag); // we're going to need this later... 
- } - if (!state->index_sequence) goto fail; - if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; - if (state->illumina_tag) { - if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { - goto fail; - } - } - if (state->fpi[file_number]) { - if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) - goto fail; - } - } - - } - - free(sub_qual); free(sub_tag); - free(linebuf.s); - return true; - - fail: - perror(__func__); - free(sub_qual); free(sub_tag); - free(linebuf.s); - return false; -} - -// Transform a bam1_t record into a string with the FASTQ representation of it -// @returns false for error, true for success -static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) -{ - int32_t qlen = b->core.l_qseq; - assert(qlen >= 0); - const uint8_t *oq = NULL; - char *qual = NULL; - - char *seq = get_read(b); - if (!seq) return false; - - if (state->use_oq) oq = bam_aux_get(b, "OQ"); - if (oq && *oq=='Z') { - qual = strdup(bam_aux2Z(oq)); - if (!qual) goto fail; - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - reverse(qual); - } - } else { - if (get_quality(b, &qual) < 0) goto fail; - } - - if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; - - free(qual); - free(seq); - return true; - - fail: - free(seq); - free(qual); - return false; -} - static void free_opts(bam2fq_opts_t *opts) { - free(opts->barcode_tag); - free(opts->quality_tag); - free(opts->index_format); - free(opts->extra_tags); free(opts); } @@ -568,13 +212,14 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) {"quality-tag", required_argument, NULL, 'q'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { + while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", + lopts, NULL)) > 0) { switch (c) { - case 'b': opts->barcode_tag = strdup(optarg); break; - case 'q': opts->quality_tag = 
strdup(optarg); break; + case 'b': opts->barcode_tag = optarg; break; + case 'q': opts->quality_tag = optarg; break; case 1 : opts->index_file[0] = optarg; break; case 2 : opts->index_file[1] = optarg; break; - case 3 : opts->index_format = strdup(optarg); break; + case 3 : opts->index_format = optarg; break; case '0': opts->fnr[0] = optarg; break; case '1': opts->fnr[1] = optarg; break; case '2': opts->fnr[2] = optarg; break; @@ -585,7 +230,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) flag_off_set = 1; opts->flag_off = 0; } - opts->flag_off |= strtol(optarg, 0, 0); break; + opts->flag_off |= strtol(optarg, 0, 0); + break; case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; case 'N': opts->has12always = true; break; @@ -593,13 +239,25 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case 's': opts->fnse = optarg; break; case 't': opts->copy_tags = true; break; case 'i': opts->illumina_tag = true; break; - case 'c': opts->compression_level = atoi(optarg); break; - case 'T': opts->extra_tags = strdup(optarg); break; + case 'c': + opts->compression_level = atoi(optarg); + if (opts->compression_level < 0) + opts->compression_level = 0; + if (opts->compression_level > 9) + opts->compression_level = 9; + break; + case 'T': opts->extra_tags = optarg; break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; + + case '?': + bam2fq_usage(samtools_stderr, argv[0]); + free_opts(opts); + return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; + bam2fq_usage(samtools_stderr, argv[0]); + free_opts(opts); + return false; } break; } @@ -608,8 +266,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; if 
(opts->has12always) opts->has12 = true; - if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); - if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); + if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG; + if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG; int nIndex = 0; if (opts->index_format) { @@ -654,7 +312,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) } const char* type_str = argv[0]; - if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { + if (strcasecmp("fastq", type_str) == 0 || + strcasecmp("bam2fq", type_str) == 0) { opts->filetype = FASTQ; } else if (strcasecmp("fasta", type_str) == 0) { opts->filetype = FASTA; @@ -682,34 +341,61 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) return true; } -static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) -{ - char mode[4] = "w"; - size_t len = strlen(filename); - - mode[2] = 0; mode[3] = 0; - if (len > 3 && strstr(filename + (len - 3),".gz")) { - mode[1] = 'g'; mode[2] = c+'0'; - } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) - || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { - mode[1] = c+'0'; - } else { - mode[1] = 'u'; +void set_sam_opts(samFile *fp, bam2fq_state_t *state, + const bam2fq_opts_t *opts) { + if (state->has12) + hts_set_opt(fp, FASTQ_OPT_RNUM, 1); + + if (state->illumina_tag) + hts_set_opt(fp, FASTQ_OPT_CASAVA, 1); + + hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag); + + kstring_t tag_list = {0,0}; + if (state->copy_tags) + kputs("RG,BC,QT", &tag_list); + if (opts->extra_tags) { + if (tag_list.l) + kputc(',', &tag_list); + kputs(opts->extra_tags, &tag_list); } + if (tag_list.l) + hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); + ks_free(&tag_list); +} - BGZF *fp = bgzf_open(filename,mode); +// Open a file as normal or gzipped based on filename. 
+// Note we always use bgzf and don't bother to attempt non-blocked +// gzip streams. This is a departure from the old fastq code. +static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) { + char modez[6]; + strcpy(modez, mode); + + size_t l = strlen(fn); + if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) || + (l > 4 && strcmp(fn+l-4, ".bgz") == 0) || + (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) { + char m[3] = {'z', state->compression_level+'0', '\0'}; + strcat(modez, m); + } + + samFile *fp = sam_open(fn, modez); if (!fp) - return fp; - if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { - bgzf_close(fp); return NULL; - } + + if (state->p.pool) + hts_set_thread_pool(fp, &state->p); + return fp; } static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) { + char *mode = opts->filetype == FASTA ? "wF" : "wf"; + bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); + if (!state) + return false; state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; state->flag_alloff = opts->flag_alloff; @@ -723,22 +409,6 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->hsamtools_stdout = NULL; state->compression_level = opts->compression_level; - state->taglist = kl_init(ktaglist); - if (opts->extra_tags) { - char *save_p; - char *s = strtok_r(opts->extra_tags, ",", &save_p); - while (s) { - if (strlen(s) != 2) { - fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s); - free(state); - return false; - } - char **et = kl_pushp(ktaglist, state->taglist); - *et = s; - s = strtok_r(NULL, ",", &save_p); - } - } - state->fp = sam_open(opts->fn_input, "r"); if (state->fp == NULL) { print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); @@ -770,12 +440,12 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } if (opts->fnse) { - state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); - if 
(state->fpse == NULL) { - print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); + if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) { + print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse); free(state); return false; } + set_sam_opts(state->fpse, state, opts); } if (opts->ga.reference) { @@ -786,6 +456,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) } } + // single, read1, read2 int i, j; for (i = 0; i < 3; ++i) { if (opts->fnr[i]) { @@ -793,28 +464,30 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) break; if (j == i) { - state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p); - if (state->fpr[i] == NULL) { - print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", + if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) { + print_error_errno("bam2fq", "Cannot open r%d file \"%s\"", i, opts->fnr[i]); free(state); return false; } + set_sam_opts(state->fpr[i], state, opts); } else { state->fpr[i] = state->fpr[j]; } } else { if (!state->hsamtools_stdout) { - state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu"); - if (!state->hsamtools_stdout) { + if (!(state->hsamtools_stdout = sam_open_z("-", mode, state))) { print_error_errno("bam2fq", "Cannot open STDOUT"); free(state); return false; } + set_sam_opts(state->hsamtools_stdout, state, opts); } state->fpr[i] = state->hsamtools_stdout; } } + + // index 1, index 2 for (i = 0; i < 2; i++) { state->fpi[i] = NULL; if (opts->index_file[i]) { @@ -825,13 +498,14 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) break; if (i == j) { - state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); - if (state->fpi[i] == NULL) { - print_error_errno("bam2fq", 
"Cannot write to i%d file \"%s\"", + if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode, + state))) { + print_error_errno("bam2fq", "Cannot open i%d file \"%s\"", i+1, opts->index_file[i]); free(state); return false; } + set_sam_opts(state->fpi[i], state, opts); } else if (j < 0) { state->fpi[i] = state->fpr[j+3]; } else { @@ -856,21 +530,25 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* bool valid = true; sam_hdr_destroy(state->h); check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); - if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } + if (state->fpse && sam_close(state->fpse) < 0) { + print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); + valid = false; + } + int i, j; for (i = 0; i < 3; ++i) { if (state->fpr[i] != state->hsamtools_stdout) { for (j = 0; j < i; j++) if (state->fpr[i] == state->fpr[j]) break; - if (j == i && bgzf_close(state->fpr[i])) { + if (j == i && sam_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } } } if (state->hsamtools_stdout) { - if (bgzf_close(state->hsamtools_stdout)) { + if (sam_close(state->hsamtools_stdout) < 0) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; } @@ -882,12 +560,11 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* for (j -= 3; j >= 0 && j < i; j++) if (state->fpi[i] == state->fpi[j]) break; - if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { + if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) { print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); valid = false; } } - kl_destroy(ktaglist,state->taglist); free(state->index_sequence); if (state->p.pool) hts_tpool_destroy(state->p.pool); @@ -903,135 +580,300 @@ static inline bool filter_it_out(const bam1_t *b, 
const bam2fq_state_t *state) } +int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state, + bam2fq_opts_t* opts, char *seq, int seq_len, + char *qual, int qual_len) { + if (!fp || !b || !seq_len) + return 0; + + int ret = -1; + bam1_t *b2 = bam_init1(); // FIXME: reuse + if (!b2) + return -1; + + size_t aux_len = b->data + b->l_data - bam_get_aux(b); + if (bam_set1(b2, b->core.l_qname, bam_get_qname(b), + (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE, + -1, -1, 0, // refid, pos, mapq + 0, NULL, // cigar + -1, -1, 0, // rnext, pnext, tlen + seq_len, seq, qual, + aux_len) < 0) + goto err; + + uint8_t *q = bam_get_qual(b2); + if (qual) { + int i; + for (i = 0; i < seq_len; i++) + q[i] -= '!'; + } else { + memset(q, opts->def_qual, seq_len); + } + + memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len); + b2->l_data += aux_len; + if (sam_write1(fp, state->h, b2) < 0) + goto err; + + ret = 0; + err: + if (b2) + bam_destroy1(b2); + return ret; +} + +int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, + bam2fq_opts_t* opts) { + bam1_t *b[2] = {b1, b2}; + + char *ifmt = opts->index_format; + if (!ifmt) + ifmt = "i*i*"; + + // Get seq / qual elements + char *bc = NULL, *qt = NULL; + if (b1) + bc = (char *)bam_aux_get(b1, opts->barcode_tag); + if (b2 && !bc) + bc = (char *)bam_aux_get(b2, opts->barcode_tag); + if (!bc) + return 0; + else + bc++; // skip Z + + if (b1) + qt = (char *)bam_aux_get(b1, opts->quality_tag); + if (b2 && !qt) + qt = (char *)bam_aux_get(b2, opts->quality_tag); + if (qt && strlen(bc) != strlen(qt)-1) + qt = NULL; + else if (qt) + qt++; + + int inum = 0; + while (inum < 2) { + char fc = *ifmt++; + if (!fc) + break; // ran out of index-format + + long len, rem = 0; + if (isdigit(*ifmt)) { + rem = len = strtol(ifmt, &ifmt, 10); + } else { + ifmt++; + len = 0; + } + + char *bc_end = bc, *qt_end = qt; + while (len ? 
*bc_end && rem-- : isalpha(*bc_end)) + bc_end++, qt_end += qt != NULL; + + switch (fc) { + case 'n': + // skip + bc = bc_end + (len==0); + if (qt) + qt = qt_end + (len==0); + break; + + case 'i': + if (write_index_rec(state->fpi[inum], b[inum], state, opts, + bc, bc_end-bc, qt, qt_end-qt) < 0) + return -1; + bc = bc_end + (len==0); + if (qt) + qt = qt_end + (len==0); + inum++; + break; + + default: + fprintf(samtools_stderr, "Unknown index-format code\n"); + return -1; + } + } + + return 0; +} + +static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts, + bam1_t *b[4], int score[3], int best[3], + int64_t *n_singletons) { + // Paired data, with 1 or 2 ends present. + if (score[1] > 0 && score[2] > 0) { + // If CASAVA tag is required and barcode is only on R1, + // copy it to R2 + if (state->illumina_tag) { + char *tag; + if ((tag = (char *)bam_aux_get(b[best[1]], + opts->barcode_tag))) + if (bam_aux_update_str(b[best[2]], + opts->barcode_tag, + strlen(tag), tag+1) < 0) + goto err; + if ((tag = (char *)bam_aux_get(b[best[1]], + opts->quality_tag))) + if (bam_aux_update_str(b[best[2]], + opts->quality_tag, + strlen(tag), tag+1) < 0) + goto err; + + } + if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0) + goto err; + if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0) + goto err; + + if (output_index(b[best[1]], b[best[2]], state, opts) < 0) + goto err; + } else if (score[1] > 0 || score[2] > 0) { + if (state->fpse) { + // print whichever one exists to fpse + if (score[1] > 0) { + if (sam_write1(state->fpse, state->h, b[best[1]]) < 0) + goto err; + } else { + if (sam_write1(state->fpse, state->h, b[best[2]]) < 0) + goto err; + } + ++(*n_singletons); + } else { + if (score[1] > 0) { + if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0) + goto err; + } else { + if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0) + goto err; + } + } + + if (output_index(score[1] > 0 ? b[best[1]] : NULL, + score[2] > 0 ? 
b[best[2]] : NULL, + state, opts) < 0) + goto err; + } + + if (score[0]) { // single ended data (neither READ1 nor READ2) + if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0) + goto err; + + if (output_index(b[best[0]], NULL, state, opts) < 0) + goto err; + } + + return 0; + + err: + return -1; +} + static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) { int n; - bam1_t *records[3] = {NULL, NULL, NULL}; char *current_qname = NULL; int64_t n_reads = 0, n_singletons = 0; // Statistics - kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; int score[3]; int at_eof; - bool valid = true; - bam1_t* b = NULL; + bool valid = false; + int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices; + // indexed by [readpart] + bam1_t *b[4]; // 3 readparts, plus current record - while (true) { - if (!b) - b = bam_init1(); - if (b == NULL) { + for (n = 0; n < 4; n++) { + if (!(b[n] = bam_init1())) { perror("[bam2fq_mainloop] Malloc error for bam record buffer."); - valid = false; - break; + return false; } - int res = sam_read1(state->fp, state->h, b); + } + + n = 0; + while (true) { + int res = sam_read1(state->fp, state->h, b[n]); if (res < -1) { fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); - valid = false; - break; + goto err; } at_eof = res < 0; - if (!at_eof && filter_it_out(b, state)) + if (!at_eof && filter_it_out(b[n], state)) continue; - if (!at_eof) ++n_reads; - - if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { - if (current_qname) { - if (state->illumina_tag) { - for (n=0; valid && n<3; n++) { - if (!records[n]) continue; - if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; - } - if (!valid) break; - } - free(state->index_sequence); state->index_sequence = NULL; - if (score[1] > 0 && score[2] > 0) { - // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] - if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) 
{ valid = false; break; } - if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } else if (score[1] > 0 || score[2] > 0) { - if (state->fpse) { - // print whichever one exists to fpse - if (score[1] > 0) { - if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } - } else { - if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } - ++n_singletons; - } else { - if (score[1] > 0) { - if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } - } else { - if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } - } - } - if (score[0]) { // TODO: check this - // print linebuf[0] to fpr[0] - if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } - } + if (!at_eof) { + ++n_reads; + + // Handle -O option: use OQ for qual + uint8_t *oq; + if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') { + int i, l = strlen((char *)++oq); + uint8_t *qual = bam_get_qual(b[n]); + for (i = 0; i < l && i < b[n]->core.l_qseq; i++) + qual[i] = oq[i] - '!'; } + } + if (at_eof + || !current_qname + || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) { + // New name, so flush best examples of previous name. + if (current_qname) + if (flush_rec(state, opts, b, score, best, &n_singletons) < 0) + goto err; - free(current_qname); current_qname = NULL; + current_qname = bam_get_qname(b[n]); score[0] = score[1] = score[2] = 0; - for (n=0; n < 3; n++) { - bam_destroy1(records[n]); records[n]=NULL; - } if (at_eof) { break; } - - current_qname = strdup(bam_get_qname(b)); - if (!current_qname) { valid = false; break; } } // Prefer a copy of the read that has base qualities - int b_score = bam_get_qual(b)[0] != 0xff? 
2 : 1; - readpart rp = which_readpart(b); - if (b_score > score[rp]) { - if (!tags2fq(b, state, opts)) { valid = false; break; } - if (records[rp]) bam_destroy1(records[rp]); - records[rp] = b; + int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1; + readpart rp = which_readpart(b[n]); + if (score[rp] < b_score) { score[rp] = b_score; - b = NULL; - if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { - fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); - valid = false; break; - } + // Record b[n] slot for best copy of readpair and find a new + // slot for next bam read + best[rp] = n; + int used_slot[4] = {0}, i; + for (i = 0; i < 3; i++) + if (best[i] >= 0) + used_slot[best[i]] = 1; + for (i = 0; i < 4 && used_slot[i]; i++) + ; + n = i; } } + + valid = true; + err: if (!valid) - { - perror("[bam2fq_mainloop] Error writing to FASTx files."); - } - bam_destroy1(b); - for (n=0; n < 3; n++) { - bam_destroy1(records[n]); - } - free(current_qname); - free(linebuf[0].s); - free(linebuf[1].s); - free(linebuf[2].s); - fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); - fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); + print_error_errno("bam2fq", "Error writing to FASTx files."); + + for (n = 0; n < 4; n++) + bam_destroy1(b[n]); + + fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", + __func__, n_singletons); + fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", + __func__, n_reads); return valid; } int main_bam2fq(int argc, char *argv[]) { - int status = EXIT_SUCCESS; + int status = EXIT_FAILURE; bam2fq_opts_t* opts = NULL; bam2fq_state_t* state = NULL; bool valid = parse_opts(argc, argv, &opts); if (!valid || opts == NULL) return valid ? 
EXIT_SUCCESS : EXIT_FAILURE; - if (!init_state(opts, &state)) return EXIT_FAILURE; + if (!init_state(opts, &state)) goto err; + + if (!bam2fq_mainloop(state,opts)) goto err; - if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; + if (!destroy_state(opts, state, &status)) goto err; - if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; + status = EXIT_SUCCESS; + err: sam_global_args_free(&opts->ga); free_opts(opts); diff --git a/samtools/bam_flags.c b/samtools/bam_flags.c index 11a82b6..78312ee 100644 --- a/samtools/bam_flags.c +++ b/samtools/bam_flags.c @@ -1,6 +1,6 @@ /* bam_flags.c -- flags subcommand. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2014, 2021 Genome Research Ltd. Author: Petr Danecek @@ -32,38 +32,54 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "samtools.h" -static void usage(void) +static void usage(FILE *fp) { - fprintf(stderr, "\n"); - fprintf(stderr, "About: Convert between textual and numeric flag representation\n"); - fprintf(stderr, "Usage: samtools flags INT|STR[,...]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Flags:\n"); - fprintf(stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); - fprintf(stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); - fprintf(stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); - fprintf(stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); - fprintf(stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); - fprintf(stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); - fprintf(stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); - fprintf(stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); - fprintf(stderr, "\t0x%x\tSECONDARY .. 
secondary alignment\n", BAM_FSECONDARY); - fprintf(stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); - fprintf(stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); - fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY); - fprintf(stderr, "\n"); + static const struct { int bit; const char *desc; } *fl, flags[] = { + { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" }, + { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" }, + { BAM_FUNMAP, "segment unmapped" }, + { BAM_FMUNMAP, "next segment in the template unmapped" }, + { BAM_FREVERSE, "SEQ is reverse complemented" }, + { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" }, + { BAM_FREAD1, "the first segment in the template" }, + { BAM_FREAD2, "the last segment in the template" }, + { BAM_FSECONDARY, "secondary alignment" }, + { BAM_FQCFAIL, "not passing quality controls or other filters" }, + { BAM_FDUP, "PCR or optical duplicate" }, + { BAM_FSUPPLEMENTARY, "supplementary alignment" }, + { 0, NULL } + }; + + fprintf(fp, +"About: Convert between textual and numeric flag representation\n" +"Usage: samtools flags FLAGS...\n" +"\n" +"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n" +"a combination of the following numeric flag values, or a comma-separated string\n" +"NAME,...,NAME representing a combination of the following flag names:\n" +"\n"); + for (fl = flags; fl->desc; fl++) { + char *name = bam_flag2str(fl->bit); + fprintf(fp, "%#6x %5d %-15s%s\n", fl->bit, fl->bit, name, fl->desc); + free(name); + } } int main_flags(int argc, char *argv[]) { - if ( argc!=2 ) usage(); - else + if ( argc < 2 ) { usage(stdout); return 0; } + + int i; + for (i = 1; i < argc; i++) { - int mask = bam_str2flag(argv[1]); - if ( mask<0 ) { fprintf(stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } - printf("0x%x\t%d\t%s\n", mask, mask, 
bam_flag2str(mask)); + int mask = bam_str2flag(argv[i]); + if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(stderr); return 1; } + char *str = bam_flag2str(mask); + printf("0x%x\t%d\t%s\n", mask, mask, str); + free(str); } return 0; } diff --git a/samtools/bam_flags.c.pysam.c b/samtools/bam_flags.c.pysam.c index 9c6424f..b3a9d29 100644 --- a/samtools/bam_flags.c.pysam.c +++ b/samtools/bam_flags.c.pysam.c @@ -2,7 +2,7 @@ /* bam_flags.c -- flags subcommand. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2014, 2021 Genome Research Ltd. Author: Petr Danecek @@ -34,38 +34,54 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "samtools.h" -static void usage(void) +static void usage(FILE *fp) { - fprintf(samtools_stderr, "\n"); - fprintf(samtools_stderr, "About: Convert between textual and numeric flag representation\n"); - fprintf(samtools_stderr, "Usage: samtools flags INT|STR[,...]\n"); - fprintf(samtools_stderr, "\n"); - fprintf(samtools_stderr, "Flags:\n"); - fprintf(samtools_stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); - fprintf(samtools_stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); - fprintf(samtools_stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); - fprintf(samtools_stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); - fprintf(samtools_stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); - fprintf(samtools_stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); - fprintf(samtools_stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); - fprintf(samtools_stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); - fprintf(samtools_stderr, "\t0x%x\tSECONDARY .. 
secondary alignment\n", BAM_FSECONDARY); - fprintf(samtools_stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); - fprintf(samtools_stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); - fprintf(samtools_stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY); - fprintf(samtools_stderr, "\n"); + static const struct { int bit; const char *desc; } *fl, flags[] = { + { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" }, + { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" }, + { BAM_FUNMAP, "segment unmapped" }, + { BAM_FMUNMAP, "next segment in the template unmapped" }, + { BAM_FREVERSE, "SEQ is reverse complemented" }, + { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" }, + { BAM_FREAD1, "the first segment in the template" }, + { BAM_FREAD2, "the last segment in the template" }, + { BAM_FSECONDARY, "secondary alignment" }, + { BAM_FQCFAIL, "not passing quality controls or other filters" }, + { BAM_FDUP, "PCR or optical duplicate" }, + { BAM_FSUPPLEMENTARY, "supplementary alignment" }, + { 0, NULL } + }; + + fprintf(fp, +"About: Convert between textual and numeric flag representation\n" +"Usage: samtools flags FLAGS...\n" +"\n" +"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n" +"a combination of the following numeric flag values, or a comma-separated string\n" +"NAME,...,NAME representing a combination of the following flag names:\n" +"\n"); + for (fl = flags; fl->desc; fl++) { + char *name = bam_flag2str(fl->bit); + fprintf(fp, "%#6x %5d %-15s%s\n", fl->bit, fl->bit, name, fl->desc); + free(name); + } } int main_flags(int argc, char *argv[]) { - if ( argc!=2 ) usage(); - else + if ( argc < 2 ) { usage(samtools_stdout); return 0; } + + int i; + for (i = 1; i < argc; i++) { - int mask = bam_str2flag(argv[1]); - if ( mask<0 ) { fprintf(samtools_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } 
- fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask)); + int mask = bam_str2flag(argv[i]); + if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(samtools_stderr); return 1; } + char *str = bam_flag2str(mask); + fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, str); + free(str); } return 0; } diff --git a/samtools/bam_import.c b/samtools/bam_import.c new file mode 100644 index 0000000..daf6b17 --- /dev/null +++ b/samtools/bam_import.c @@ -0,0 +1,487 @@ +/* bam_import -- Import of FASTQ files. + * + * samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq + * samtools import a_1.fq a_2.fq + * samtools import a_interleaved.fq + * + * Copyright (C) 2020 Genome Research Ltd. + * + * Author: James Bonfield + */ + +/* +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +// TODO: Store other non-aux comments; in new sam tag? 
+ +#include +#include + +#include "htslib/sam.h" +#include "htslib/thread_pool.h" + +#include "samtools.h" +#include "sam_opts.h" + +static int usage(FILE *fp, int exit_status) { + fprintf(fp, "Usage: samtools import [options] [file.fastq ...]\n"); + fprintf(fp, "\n"); + fprintf(fp, "Options:\n"); + fprintf(fp, " -s FILE Read paired-ended data from single FILE\n"); + fprintf(fp, " -0 FILE Read single-ended data from FILE\n"); + fprintf(fp, " -1 FILE Read-1 from FILE\n"); + fprintf(fp, " -2 FILE Read-2 from FILE\n"); + fprintf(fp, " --i1 FILE Index-1 from FILE\n"); + fprintf(fp, " --i2 FILE Index-2 from FILE\n"); + fprintf(fp, " -i Parse CASAVA identifier\n"); + fprintf(fp, " --barcode-tag TAG\n"); + fprintf(fp, " Tag to use with barcode sequences [BC]\n"); + fprintf(fp, " --quality-tag TAG\n"); + fprintf(fp, " Tag to use with barcode qualities [QT]\n"); + fprintf(fp, " -r STRING Build up a complete @RG line\n"); + fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"); + fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"); + fprintf(fp, " -o FILE Output to FILE instead of stdout\n"); + fprintf(fp, " -u Uncompressed output\n"); + fprintf(fp, " --order TAG Store Nth record count in TAG\n"); + fprintf(fp, "\n"); + sam_global_opt_help(fp, "-.O.-@--"); + + fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n"); + fprintf(fp, "file contents, and a pair of fastq files as \"-1 FILE1 -2 FILE2\".\n"); + + return exit_status; +} + +// Order matters here as we want to read index elements before main +// sequences so on reading the seqs we can emit a fully annotated record. +enum fileno { + FQ_I1, FQ_I2, // index seqs for R1 and R2 + FQ_R0, // single file and unpaired data (singled-ended tech). + FQ_R1, FQ_R2, // separate read1 and read2 files + FQ_SINGLE, // single file, but with read1 and/or read2 present. 
+ FQ_END +}; + +typedef struct { + sam_global_args ga; + int no_pg; + char *fn[FQ_END], *fn_out; + int idx_both; // add index to READ2 too, not just READ1 + int casava; + char *barcode_seq; + char *barcode_qual; + char *aux; + char *rg; + char *rg_line; + char *order; + int compress_level; + htsThreadPool p; +} opts_t; + +// Append a sequence and quality string from a BAM record to a BC:Z and +// QT:Z style aux tag string. +static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) { + char *sp, *qp; + if (ks_resize(s, s->l + b->core.l_qseq+1 +1) < 0) + return -1; + if (ks_resize(q, q->l + b->core.l_qseq+1 +1) < 0) + return -1; + + sp = s->s + s->l - (s->l > 0); + qp = q->s + q->l - (q->l > 0); + + if (s->l) + *sp++ = '-'; + + if (q->l) + *qp++ = ' '; + + int i; + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; i++) { + *sp++ = seq_nt16_str[bam_seqi(seq, i)]; + *qp++ = qual[i] + '!'; + } + *sp++ = 0; + *qp++ = 0; + + s->l = sp - s->s; + q->l = qp - q->s; + + return 0; +} + +static int import_fastq(int argc, char **argv, opts_t *opts) { + int i, n, ret = 0; + samFile *fp_in[FQ_END] = {NULL}; + bam1_t *b = bam_init1(); + int ids[FQ_END]; + samFile *fp_out = NULL; + sam_hdr_t *hdr_out = NULL; + kstring_t index_str = {0,0}; + kstring_t read_str = {0,0}; + char *rg = opts->rg; + kstring_t rg_line = {0,0}; + uint64_t read_num = 0; + kstring_t idx_seq = {0}; + kstring_t idx_qual = {0}; + + // Any additional arguments are assumed to be r1 r2, as a + // short cut. We support reading index tags out of those too (eg + // Illumina CASAVA format), but if we do that we lack the barcode + // quality string. + // + // We also consider a read name ending in /1 or /2 to be a single + // file containing interleaved fastq records for both ends. + // These will be labeled as fn[FQ_R1] but adjusted during reading. 
+ if (argc == 1) + opts->fn[FQ_SINGLE] = argv[0]; + else + for (i = 0; i < 4; i++) + if (argc > i) + opts->fn[FQ_R1+i] = argv[i]; + + // Open all files + for (i = n = 0; i < FQ_END; i++) { + if (!opts->fn[i]) + continue; + fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in); + if (!fp_in[i]) { + perror(opts->fn[i]); + ret = -1; + goto err; + } + if (opts->p.pool) + hts_set_thread_pool(fp_in[i], &opts->p); + ids[n++] = i; + + if (opts->casava) + hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1); + if (opts->barcode_seq) // for auto-CASAVA parsing + hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq); + if (opts->aux) + hts_set_opt(fp_in[i], FASTQ_OPT_AUX, + *opts->aux == '*' || *opts->aux == '\0' + ? NULL : opts->aux); + + switch (i) { + case FQ_I1: + kputs("--i1 I1.fastq ", &read_str); + kputs("i*", &index_str); + break; + case FQ_I2: + kputs("--i2 I2.fastq ", &read_str); + kputs("i*", &index_str); + break; + + case FQ_R0: + kputs("-0 unpaired.fastq ", &read_str); + break; + + case FQ_R1: + kputs("-1 R1.fastq ", &read_str); + break; + + case FQ_R2: + kputs("-2 R2.fastq ", &read_str); + break; + + case FQ_SINGLE: + kputs("-N -o paired.fastq ", &read_str); + break; + + default: + ks_clear(&read_str); // not reversible + kputs("", &read_str); + } + } + if (n == 0) { + bam_destroy1(b); + return usage(stdout, EXIT_SUCCESS); + } + + char out_mode[10] = {'w', 0, 0}; + if (opts->compress_level != -1) + out_mode[1] = '0' + opts->compress_level; + sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL); + fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out); + if (!fp_out) { + perror(opts->fn_out); + goto err; + } + if (opts->p.pool) + hts_set_thread_pool(fp_out, &opts->p); + + // Create header + if (ks_len(&read_str)) { + char CO[2100]; + if (ks_len(&index_str)) + snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s " + "--index-format=\"%s\"\n", + ks_str(&read_str), ks_str(&index_str)); + else + snprintf(CO, sizeof(CO), "@CO\tReverse with: 
samtools fastq %s\n", + ks_str(&read_str)); + + hdr_out = sam_hdr_parse(strlen(CO), CO); + } else { + hdr_out = sam_hdr_init(); + } + + // Read group + if (opts->rg_line) { + if (*opts->rg_line != '@') + ksprintf(&rg_line, "@RG\t%s", opts->rg_line); + else + kputs(opts->rg_line, &rg_line); + } else if (opts->rg) { + ksprintf(&rg_line, "@RG\tID:%s", opts->rg); + } + + if (ks_len(&rg_line)) { + if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0) + goto err; + rg = strstr(ks_str(&rg_line), "\tID:"); + if (!rg) { + fprintf(stderr, "\"-r RG-LINE\" option contained no ID field\n"); + goto err; + } + rg += 4; + + i = 0; + while (rg[i] != '\t' && rg[i] != '\0') + i++; + rg[i] = 0; + } + + if ((ret = sam_hdr_write(fp_out, hdr_out)) < 0) + goto err; + + + // Interleave / combine from n files (ids[0..n-1]). + int res; + int eof = 0; + do { + idx_seq.l = idx_qual.l = 0; + for (i = 0; i < n; i++) { + if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) { + if (res == -1) { + eof++; + continue; + } else + break; + } + + // index + if (ids[i] == FQ_I1 || ids[i] == FQ_I2) { + if (append_index(&idx_seq, &idx_qual, b) < 0) { + res = -1; + break; + } + continue; + } + + // full read + if (idx_seq.l) { + if (opts->idx_both || ids[i] == FQ_SINGLE || + ids[i] == FQ_R0 || ids[i] == FQ_R1) { + if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l, + (uint8_t *)idx_seq.s) || + bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l, + (uint8_t *)idx_qual.s)) { + res = -1; + break; + } + } + } + + switch(ids[i]) { + case FQ_R0: + // unpaired; no flags to declare + break; + case FQ_SINGLE: + // paired (but don't know if R1 or R2) or unpaired. + // We rely on the /1 and /2 read suffix parsing in htslib + // to distinguish the two cases, or CASAVA tags if + // explicitly enabled. 
+ break; + case FQ_R1: + if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0) + b->core.flag |= BAM_FREAD1; + b->core.flag |= BAM_FPAIRED; + if (i+1 < n && ids[i+1] == FQ_R2) + b->core.flag |= BAM_FMUNMAP; + break; + case FQ_R2: + b->core.flag |= BAM_FPAIRED | BAM_FREAD2; + if (i > 0 && ids[i-1] == FQ_R1) + b->core.flag |= BAM_FMUNMAP; + break; + } + + if (rg) { + if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1, + (uint8_t *)rg) < 0) { + ret = -1; + goto err; + } + } + + if (opts->order) { + if (bam_aux_update_int(b, opts->order, read_num++) < 0) { + ret = -1; + goto err; + } + } + + res = sam_write1(fp_out, hdr_out, b); + } + } while (res >= 0); + + if (res != -1) { + print_error("import", "truncated file. Aborting"); + ret = res; + goto err; + } + + if (eof != n) { + print_error("import", "input files with differing number of records"); + ret = -1; + goto err; + } + + // Close and return + ret = 0; +err: + bam_destroy1(b); + sam_hdr_destroy(hdr_out); + ks_free(&rg_line); + ks_free(&index_str); + ks_free(&read_str); + if (fp_out) { + if (sam_close(fp_out) < 0) { + perror(opts->fn_out); + ret |= -1; + } + } + for (i = 0; i < FQ_END; i++) { + if (fp_in[i] && sam_close(fp_in[i]) < 0) { + perror(opts->fn[i]); + ret |= -1; + } + } + ks_free(&idx_seq); + ks_free(&idx_qual); + + return ret; +} + +int main_import(int argc, char *argv[]) { + int c; + opts_t opts = { + .no_pg = 0, + .ga = SAM_GLOBAL_ARGS_INIT, + .fn = {NULL}, + .fn_out = "-", + .casava = 0, + .barcode_seq = "BC", + .barcode_qual = "QT", + .aux = NULL, + .rg = NULL, + .rg_line = NULL, + .order = NULL, + .compress_level = -1, + }; + kstring_t rg = {0}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'), + {"no-PG", no_argument, NULL, 9}, + {"i1", required_argument, NULL, 1}, + {"i2", required_argument, NULL, 2}, + {"r1", required_argument, NULL, '1'}, + {"r2", required_argument, NULL, '2'}, + {"rg", required_argument, NULL, 'R'}, + {"rg-line", required_argument, NULL, 
'r'}, + {"order", required_argument, NULL, 3}, + {"barcode-tag", required_argument, NULL, 4}, + {"quality-tag", required_argument, NULL, 5}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) { + switch (c) { + case 'b': opts.idx_both = 1; break; + case '0': opts.fn[FQ_R0] = optarg; break; + case '1': opts.fn[FQ_R1] = optarg; break; + case '2': opts.fn[FQ_R2] = optarg; break; + case 1: opts.fn[FQ_I1] = optarg; break; + case 2: opts.fn[FQ_I2] = optarg; break; + case 's': opts.fn[FQ_SINGLE] = optarg; break; + case 'o': opts.fn_out = optarg; break; + case 'i': opts.casava = 1; break; + case 4: opts.barcode_seq = optarg; break; + case 5: opts.barcode_qual = optarg; break; + case 'T': opts.aux = optarg; break; + case 'u': opts.compress_level = 0; break; + case 'R': opts.rg = optarg; break; + case 'r': + if (*optarg != '@' && ks_len(&rg) == 0) + kputs("@RG", &rg); + if (ks_len(&rg)) + kputc_('\t', &rg); + kputs(optarg, &rg); + opts.rg_line = rg.s; + break; + + case 9: opts.no_pg = 1; break; + case 3: opts.order = optarg; break; + + case 'h': return usage(stdout, EXIT_SUCCESS); + case '?': return usage(stderr, EXIT_FAILURE); + + default: + if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0) + return usage(stderr, EXIT_FAILURE); + break; + } + } + + if (opts.ga.nthreads > 0) { + if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) { + fprintf(stderr, "Failed to create thread pool\n"); + if (rg.s) + free(rg.s); + return -1;; + } + } + + int ret = import_fastq(argc-optind, argv+optind, &opts) ? 1 : 0; + + if (rg.s) + free(rg.s); + + if (opts.p.pool) + hts_tpool_destroy(opts.p.pool); + + return ret; +} diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c new file mode 100644 index 0000000..1307ac6 --- /dev/null +++ b/samtools/bam_import.c.pysam.c @@ -0,0 +1,489 @@ +#include "samtools.pysam.h" + +/* bam_import -- Import of FASTQ files. 
+ * + * samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq + * samtools import a_1.fq a_2.fq + * samtools import a_interleaved.fq + * + * Copyright (C) 2020 Genome Research Ltd. + * + * Author: James Bonfield + */ + +/* +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +// TODO: Store other non-aux comments; in new sam tag? 
+ +#include +#include + +#include "htslib/sam.h" +#include "htslib/thread_pool.h" + +#include "samtools.h" +#include "sam_opts.h" + +static int usage(FILE *fp, int exit_status) { + fprintf(fp, "Usage: samtools import [options] [file.fastq ...]\n"); + fprintf(fp, "\n"); + fprintf(fp, "Options:\n"); + fprintf(fp, " -s FILE Read paired-ended data from single FILE\n"); + fprintf(fp, " -0 FILE Read single-ended data from FILE\n"); + fprintf(fp, " -1 FILE Read-1 from FILE\n"); + fprintf(fp, " -2 FILE Read-2 from FILE\n"); + fprintf(fp, " --i1 FILE Index-1 from FILE\n"); + fprintf(fp, " --i2 FILE Index-2 from FILE\n"); + fprintf(fp, " -i Parse CASAVA identifier\n"); + fprintf(fp, " --barcode-tag TAG\n"); + fprintf(fp, " Tag to use with barcode sequences [BC]\n"); + fprintf(fp, " --quality-tag TAG\n"); + fprintf(fp, " Tag to use with barcode qualities [QT]\n"); + fprintf(fp, " -r STRING Build up a complete @RG line\n"); + fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"); + fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"); + fprintf(fp, " -o FILE Output to FILE instead of samtools_stdout\n"); + fprintf(fp, " -u Uncompressed output\n"); + fprintf(fp, " --order TAG Store Nth record count in TAG\n"); + fprintf(fp, "\n"); + sam_global_opt_help(fp, "-.O.-@--"); + + fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n"); + fprintf(fp, "file contents, and a pair of fastq files as \"-1 FILE1 -2 FILE2\".\n"); + + return exit_status; +} + +// Order matters here as we want to read index elements before main +// sequences so on reading the seqs we can emit a fully annotated record. +enum fileno { + FQ_I1, FQ_I2, // index seqs for R1 and R2 + FQ_R0, // single file and unpaired data (singled-ended tech). + FQ_R1, FQ_R2, // separate read1 and read2 files + FQ_SINGLE, // single file, but with read1 and/or read2 present. 
+ FQ_END +}; + +typedef struct { + sam_global_args ga; + int no_pg; + char *fn[FQ_END], *fn_out; + int idx_both; // add index to READ2 too, not just READ1 + int casava; + char *barcode_seq; + char *barcode_qual; + char *aux; + char *rg; + char *rg_line; + char *order; + int compress_level; + htsThreadPool p; +} opts_t; + +// Append a sequence and quality string from a BAM record to a BC:Z and +// QT:Z style aux tag string. +static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) { + char *sp, *qp; + if (ks_resize(s, s->l + b->core.l_qseq+1 +1) < 0) + return -1; + if (ks_resize(q, q->l + b->core.l_qseq+1 +1) < 0) + return -1; + + sp = s->s + s->l - (s->l > 0); + qp = q->s + q->l - (q->l > 0); + + if (s->l) + *sp++ = '-'; + + if (q->l) + *qp++ = ' '; + + int i; + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; i++) { + *sp++ = seq_nt16_str[bam_seqi(seq, i)]; + *qp++ = qual[i] + '!'; + } + *sp++ = 0; + *qp++ = 0; + + s->l = sp - s->s; + q->l = qp - q->s; + + return 0; +} + +static int import_fastq(int argc, char **argv, opts_t *opts) { + int i, n, ret = 0; + samFile *fp_in[FQ_END] = {NULL}; + bam1_t *b = bam_init1(); + int ids[FQ_END]; + samFile *fp_out = NULL; + sam_hdr_t *hdr_out = NULL; + kstring_t index_str = {0,0}; + kstring_t read_str = {0,0}; + char *rg = opts->rg; + kstring_t rg_line = {0,0}; + uint64_t read_num = 0; + kstring_t idx_seq = {0}; + kstring_t idx_qual = {0}; + + // Any additional arguments are assumed to be r1 r2, as a + // short cut. We support reading index tags out of those too (eg + // Illumina CASAVA format), but if we do that we lack the barcode + // quality string. + // + // We also consider a read name ending in /1 or /2 to be a single + // file containing interleaved fastq records for both ends. + // These will be labeled as fn[FQ_R1] but adjusted during reading. 
+ if (argc == 1) + opts->fn[FQ_SINGLE] = argv[0]; + else + for (i = 0; i < 4; i++) + if (argc > i) + opts->fn[FQ_R1+i] = argv[i]; + + // Open all files + for (i = n = 0; i < FQ_END; i++) { + if (!opts->fn[i]) + continue; + fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in); + if (!fp_in[i]) { + perror(opts->fn[i]); + ret = -1; + goto err; + } + if (opts->p.pool) + hts_set_thread_pool(fp_in[i], &opts->p); + ids[n++] = i; + + if (opts->casava) + hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1); + if (opts->barcode_seq) // for auto-CASAVA parsing + hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq); + if (opts->aux) + hts_set_opt(fp_in[i], FASTQ_OPT_AUX, + *opts->aux == '*' || *opts->aux == '\0' + ? NULL : opts->aux); + + switch (i) { + case FQ_I1: + kputs("--i1 I1.fastq ", &read_str); + kputs("i*", &index_str); + break; + case FQ_I2: + kputs("--i2 I2.fastq ", &read_str); + kputs("i*", &index_str); + break; + + case FQ_R0: + kputs("-0 unpaired.fastq ", &read_str); + break; + + case FQ_R1: + kputs("-1 R1.fastq ", &read_str); + break; + + case FQ_R2: + kputs("-2 R2.fastq ", &read_str); + break; + + case FQ_SINGLE: + kputs("-N -o paired.fastq ", &read_str); + break; + + default: + ks_clear(&read_str); // not reversible + kputs("", &read_str); + } + } + if (n == 0) { + bam_destroy1(b); + return usage(samtools_stdout, EXIT_SUCCESS); + } + + char out_mode[10] = {'w', 0, 0}; + if (opts->compress_level != -1) + out_mode[1] = '0' + opts->compress_level; + sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL); + fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out); + if (!fp_out) { + perror(opts->fn_out); + goto err; + } + if (opts->p.pool) + hts_set_thread_pool(fp_out, &opts->p); + + // Create header + if (ks_len(&read_str)) { + char CO[2100]; + if (ks_len(&index_str)) + snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s " + "--index-format=\"%s\"\n", + ks_str(&read_str), ks_str(&index_str)); + else + snprintf(CO, sizeof(CO), 
"@CO\tReverse with: samtools fastq %s\n", + ks_str(&read_str)); + + hdr_out = sam_hdr_parse(strlen(CO), CO); + } else { + hdr_out = sam_hdr_init(); + } + + // Read group + if (opts->rg_line) { + if (*opts->rg_line != '@') + ksprintf(&rg_line, "@RG\t%s", opts->rg_line); + else + kputs(opts->rg_line, &rg_line); + } else if (opts->rg) { + ksprintf(&rg_line, "@RG\tID:%s", opts->rg); + } + + if (ks_len(&rg_line)) { + if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0) + goto err; + rg = strstr(ks_str(&rg_line), "\tID:"); + if (!rg) { + fprintf(samtools_stderr, "\"-r RG-LINE\" option contained no ID field\n"); + goto err; + } + rg += 4; + + i = 0; + while (rg[i] != '\t' && rg[i] != '\0') + i++; + rg[i] = 0; + } + + if ((ret = sam_hdr_write(fp_out, hdr_out)) < 0) + goto err; + + + // Interleave / combine from n files (ids[0..n-1]). + int res; + int eof = 0; + do { + idx_seq.l = idx_qual.l = 0; + for (i = 0; i < n; i++) { + if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) { + if (res == -1) { + eof++; + continue; + } else + break; + } + + // index + if (ids[i] == FQ_I1 || ids[i] == FQ_I2) { + if (append_index(&idx_seq, &idx_qual, b) < 0) { + res = -1; + break; + } + continue; + } + + // full read + if (idx_seq.l) { + if (opts->idx_both || ids[i] == FQ_SINGLE || + ids[i] == FQ_R0 || ids[i] == FQ_R1) { + if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l, + (uint8_t *)idx_seq.s) || + bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l, + (uint8_t *)idx_qual.s)) { + res = -1; + break; + } + } + } + + switch(ids[i]) { + case FQ_R0: + // unpaired; no flags to declare + break; + case FQ_SINGLE: + // paired (but don't know if R1 or R2) or unpaired. + // We rely on the /1 and /2 read suffix parsing in htslib + // to distinguish the two cases, or CASAVA tags if + // explicitly enabled. 
+ break; + case FQ_R1: + if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0) + b->core.flag |= BAM_FREAD1; + b->core.flag |= BAM_FPAIRED; + if (i+1 < n && ids[i+1] == FQ_R2) + b->core.flag |= BAM_FMUNMAP; + break; + case FQ_R2: + b->core.flag |= BAM_FPAIRED | BAM_FREAD2; + if (i > 0 && ids[i-1] == FQ_R1) + b->core.flag |= BAM_FMUNMAP; + break; + } + + if (rg) { + if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1, + (uint8_t *)rg) < 0) { + ret = -1; + goto err; + } + } + + if (opts->order) { + if (bam_aux_update_int(b, opts->order, read_num++) < 0) { + ret = -1; + goto err; + } + } + + res = sam_write1(fp_out, hdr_out, b); + } + } while (res >= 0); + + if (res != -1) { + print_error("import", "truncated file. Aborting"); + ret = res; + goto err; + } + + if (eof != n) { + print_error("import", "input files with differing number of records"); + ret = -1; + goto err; + } + + // Close and return + ret = 0; +err: + bam_destroy1(b); + sam_hdr_destroy(hdr_out); + ks_free(&rg_line); + ks_free(&index_str); + ks_free(&read_str); + if (fp_out) { + if (sam_close(fp_out) < 0) { + perror(opts->fn_out); + ret |= -1; + } + } + for (i = 0; i < FQ_END; i++) { + if (fp_in[i] && sam_close(fp_in[i]) < 0) { + perror(opts->fn[i]); + ret |= -1; + } + } + ks_free(&idx_seq); + ks_free(&idx_qual); + + return ret; +} + +int main_import(int argc, char *argv[]) { + int c; + opts_t opts = { + .no_pg = 0, + .ga = SAM_GLOBAL_ARGS_INIT, + .fn = {NULL}, + .fn_out = "-", + .casava = 0, + .barcode_seq = "BC", + .barcode_qual = "QT", + .aux = NULL, + .rg = NULL, + .rg_line = NULL, + .order = NULL, + .compress_level = -1, + }; + kstring_t rg = {0}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'), + {"no-PG", no_argument, NULL, 9}, + {"i1", required_argument, NULL, 1}, + {"i2", required_argument, NULL, 2}, + {"r1", required_argument, NULL, '1'}, + {"r2", required_argument, NULL, '2'}, + {"rg", required_argument, NULL, 'R'}, + {"rg-line", required_argument, NULL, 
'r'}, + {"order", required_argument, NULL, 3}, + {"barcode-tag", required_argument, NULL, 4}, + {"quality-tag", required_argument, NULL, 5}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) { + switch (c) { + case 'b': opts.idx_both = 1; break; + case '0': opts.fn[FQ_R0] = optarg; break; + case '1': opts.fn[FQ_R1] = optarg; break; + case '2': opts.fn[FQ_R2] = optarg; break; + case 1: opts.fn[FQ_I1] = optarg; break; + case 2: opts.fn[FQ_I2] = optarg; break; + case 's': opts.fn[FQ_SINGLE] = optarg; break; + case 'o': opts.fn_out = optarg; break; + case 'i': opts.casava = 1; break; + case 4: opts.barcode_seq = optarg; break; + case 5: opts.barcode_qual = optarg; break; + case 'T': opts.aux = optarg; break; + case 'u': opts.compress_level = 0; break; + case 'R': opts.rg = optarg; break; + case 'r': + if (*optarg != '@' && ks_len(&rg) == 0) + kputs("@RG", &rg); + if (ks_len(&rg)) + kputc_('\t', &rg); + kputs(optarg, &rg); + opts.rg_line = rg.s; + break; + + case 9: opts.no_pg = 1; break; + case 3: opts.order = optarg; break; + + case 'h': return usage(samtools_stdout, EXIT_SUCCESS); + case '?': return usage(samtools_stderr, EXIT_FAILURE); + + default: + if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0) + return usage(samtools_stderr, EXIT_FAILURE); + break; + } + } + + if (opts.ga.nthreads > 0) { + if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) { + fprintf(samtools_stderr, "Failed to create thread pool\n"); + if (rg.s) + free(rg.s); + return -1;; + } + } + + int ret = import_fastq(argc-optind, argv+optind, &opts) ? 
1 : 0; + + if (rg.s) + free(rg.s); + + if (opts.p.pool) + hts_tpool_destroy(opts.p.pool); + + return ret; +} diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c index 5399da7..7b2ee3e 100644 --- a/samtools/bam_index.c.pysam.c +++ b/samtools/bam_index.c.pysam.c @@ -170,7 +170,7 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, "Usage: samtools idxstats [options] \n"); sam_global_opt_help(fp, "-.---@-."); - exit(exit_status); + samtools_exit(exit_status); } int bam_idxstats(int argc, char *argv[]) diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index 1619b5b..2da184f 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -1,7 +1,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2019 Genome Research Ltd. + Copyright (C) 2017-2020 Genome Research Ltd. Author: Andrew Whitwham @@ -62,6 +62,7 @@ typedef struct { int mode; int write_index; int include_fails; + int check_chain; char *stats_file; char *arg_list; char *out_fn; @@ -83,6 +84,7 @@ typedef struct read_queue_s { bam1_t *b; struct read_queue_s *duplicate; hts_pos_t pos; + int dup_checked; } read_queue_t; typedef struct { @@ -94,8 +96,23 @@ typedef struct { char type; } dup_map_t; +typedef struct { + bam1_t *b; + int64_t score; + int64_t mate_score; + long x; + long y; + int opt; + int xpos; +} check_t; +typedef struct { + check_t *c; + size_t size; + size_t length; +} check_list_t; + static khint32_t do_hash(unsigned char *key, khint32_t len); static khint_t hash_key(key_data_t key) { @@ -665,6 +682,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n } +/* Get the position of the coordinates from the read name. 
*/ static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { int sep = 0; int pos = 0; @@ -693,6 +711,66 @@ static inline int get_coordinate_positions(const char *qname, int *xpos, int *yp return sep; } + +static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) { + int ret = 1; + int seps, xpos = 0, ypos = 0; + long x = 0, y = 0; + char *end; + + seps = get_coordinate_positions(name, &xpos, &ypos); + + /* The most current Illumina read format at time of writing is: + @machine:run:flowcell:lane:tile:x:y:UMI or + @machine:run:flowcell:lane:tile:x:y + + Counting the separating colons gives us a quick format check. + Older name formats have fewer elements. + */ + + if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name); + } + + return ret; + } + + x = strtol(name + xpos, &end, 10); + + if ((name + xpos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name); + } + + return ret; + } + + y = strtol(name + ypos, &end, 10); + + if ((name + ypos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name); + } + + return ret; + } + + *x_coord = x; + *y_coord = y; + *xpos_out = xpos; + ret = 0; + + return ret; +} + + /* Using the coordinates from the Illumina read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ @@ -806,6 +884,59 @@ static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warn } +/* Using the coordinates from the Illumina read name, see whether the duplicated read is + close enough (set by max_dist) to the original to be counted as optical. 
+ + This function needs the values from the first read to be already calculated. */ + +static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { + int ret = 0; + char *duplicate; + int dxpos = 0; + long dx, dy; + + duplicate = bam_get_qname(dup); + + if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) { + return ret; + } + + if (strncmp(name, duplicate, oxpos - 1) == 0) { + // the initial parts match, look at the numbers + long xdiff, ydiff; + + if (ox > dx) { + xdiff = ox - dx; + } else { + xdiff = dx - ox; + } + + if (xdiff <= max_dist) { + // still might be optical + + if (oy > dy) { + ydiff = oy - dy; + } else { + ydiff = dy - oy; + } + + if (ydiff <= max_dist) ret = 1; + } + } + + c->x = dx; + c->y = dy; + c->xpos = dxpos; + + if (ret) { + c->opt = ret; + } + + return ret; +} + + +/* Mark the read as a duplicate and update the duplicate hash (if needed) */ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, long *optical, long *warn) { char dup_type = 0; @@ -814,7 +945,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam dup->core.flag |= BAM_FDUP; if (param->tag) { - if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { + if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) { fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); return -1; } @@ -822,12 +953,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam if (param->opt_dist) { // mark optical duplicates if (optical_duplicate(ori, dup, param->opt_dist, warn)) { - bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); + bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; } else { // not an optical duplicate - bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB"); + 
bam_aux_update_str(dup, "dt", 3, "LB"); } } @@ -853,17 +984,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } +/* If the duplicate type has changed to optical then retag and duplicate hash. */ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { int ret = 0; - uint8_t *data; - // remove any existing dt tag - if ((data = bam_aux_get(b, "dt")) != NULL) { - bam_aux_del(b, data); - } - - if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { - fprintf(stderr, "[markdup] error: unable to append 'dt' tag.\n"); + if (bam_aux_update_str(b, "dt", 3, "SQ")) { + fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n"); ret = -1; } @@ -897,23 +1023,54 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash } +/* Check all duplicates of the highest quality read (the "original") for consistancy. Also + pre-calculate any values for use in check_duplicate_chain later. + Returns 0 on success, >0 on coordinate reading error (program can continue) or + <0 on an error (program should not continue. */ +static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, + check_list_t *list, long *warn, long *optical_single, long *optical_pair) { -/* - Where there is more than one duplicate go down the list and check for optical duplicates and change - do tags (where used) to point to original (non-duplicate) read. 
-*/ -static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, - long *warn, long *optical_single, long *optical_pair) { int ret = 0; - read_queue_t *current = ori->duplicate; char *ori_name = bam_get_qname(ori->b); - int have_original = !(ori->b->core.flag & BAM_FDUP); - int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); + read_queue_t *current = ori->duplicate; + int xpos; + long x, y; + + if (param->opt_dist) { + if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) { + return ret; + } + } + + list->length = 0; while (current) { - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + check_t *c; + + if (list->length >= list->size) { + check_t *tmp; + + list->size *= 2; + + if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) { + fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n"); + return -1; + } + + list->c = tmp; + } + + c = &list->c[list->length]; - if (param->tag && have_original) { + c->b = current->b; + c->x = -1; + c->y = -1; + c->opt = 0; + c->score = 0; + c->mate_score = 0; + current->dup_checked = 1; + + if (param->tag) { uint8_t *data; // at this stage all duplicates should have a do tag @@ -923,10 +1080,8 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has if (old_name) { if (strcmp(old_name, ori_name) != 0) { - bam_aux_del(current->b, data); - - if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { - fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); + if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) { + fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n"); ret = -1; break; } @@ -940,118 +1095,226 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has } if (param->opt_dist) { - int is_cur_opt = 0, is_ori_opt = 0; uint8_t *data; 
char *dup_type; + int is_opt = 0; + int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); - if ((data = bam_aux_get(ori->b, "dt"))) { + if ((data = bam_aux_get(current->b, "dt"))) { if ((dup_type = bam_aux2Z(data))) { if (strcmp(dup_type, "SQ") == 0) { - is_ori_opt = 1; + c->opt = 1; } } } - if ((data = bam_aux_get(current->b, "dt"))) { - if ((dup_type = bam_aux2Z(data))) { - if (strcmp(dup_type, "SQ") == 0) { - is_cur_opt = 1; - } + // need to run this to get the duplicates x and y scores + is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn); + + if (!c->opt && is_opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + ret = -1; + break; } + + c->opt = 1; } - if (!(is_ori_opt && is_cur_opt)) { - // if both are already optical duplicates there is no need to check again, otherwise... + c->score = calc_score(current->b); - if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { - // find out which one is the duplicate - int is_cur_dup = 0; + if (current_paired) { + if ((c->mate_score = get_mate_score(current->b)) == -1) { + fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + ret = -1; + break; + } + } + } - if (have_original) { - // compared against an original, this is a dup. - is_cur_dup = 1; - } else if (ori_paired != current_paired) { - if (!current_paired) { - // current is single vs pair, this is a dup. 
- is_cur_dup = 1; - } - } else { - // do it by scores - int64_t ori_score, curr_score; + current = current->duplicate; + list->length++; + } - if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { - if (ori->b->core.flag & BAM_FQCFAIL) { - ori_score = 0; - curr_score = 1; - } else { - ori_score = 1; - curr_score = 0; - } - } else { - ori_score = calc_score(ori->b); - curr_score = calc_score(current->b); - - if (current_paired) { - // they are pairs so add mate scores. - int64_t mate_tmp; - - if ((mate_tmp = get_mate_score(ori->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); - ret = -1; - break; - } else { - ori_score += mate_tmp; - } + return ret; +} - if ((mate_tmp = get_mate_score(current->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); - ret = -1; - break; - } else { - curr_score += mate_tmp; - } - } - } - if (ori_score == curr_score) { - if (strcmp(bam_get_qname(current->b), ori_name) < 0) { - curr_score++; - } else { - curr_score--; - } - } +static int xcoord_sort(const void *a, const void *b) { + check_t *ac = (check_t *) a; + check_t *bc = (check_t *) b; - if (ori_score > curr_score) { - is_cur_dup = 1; - } + return (ac->x - bc->x); +} + + +/* Check all the duplicates against each other to see if they are optical duplicates. 
*/ +static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list, + long *warn, long *optical_single, long *optical_pair) { + int ret = 0; + size_t curr = 0; + + qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort); + + while (curr < list->length - 1) { + check_t *current = &list->c[curr]; + size_t count = curr; + char *cur_name = bam_get_qname(current->b); + int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + + while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) { + // while close enough along the x coordinate + check_t *chk = &list->c[count]; + + if (current->opt && chk->opt) + continue; + + // if both are already optical duplicates there is no need to check again, otherwise... + + long ydiff; + + if (current->y > chk->y) { + ydiff = current->y - chk->y; + } else { + ydiff = chk->y - current->y; + } + + if (ydiff > param->opt_dist) + continue; + + // the number are right, check the names + if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0) + continue; + + // optical duplicates + int chk_dup = 0; + int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); + + if (current_paired != chk_paired) { + if (!chk_paired) { + // chk is single vs pair, this is a dup. 
+ chk_dup = 1; + } + } else { + // do it by scores + int64_t cur_score, chk_score; + + if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) { + if (current->b->core.flag & BAM_FQCFAIL) { + cur_score = 0; + chk_score = 1; + } else { + cur_score = 1; + chk_score = 0; } + } else { + cur_score = current->score; + chk_score = chk->score; - if (is_cur_dup) { - // the current is the optical duplicate - if (!is_cur_opt) { // only change if not already an optical duplicate - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { - ret = -1; - break; - } - } + if (current_paired) { + // they are pairs so add mate scores. + chk_score += chk->mate_score; + cur_score += current->mate_score; + } + } + + if (cur_score == chk_score) { + if (strcmp(bam_get_qname(chk->b), cur_name) < 0) { + chk_score++; } else { - if (!is_ori_opt) { - if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { - ret = -1; - break; - } - } + chk_score--; } } + + if (cur_score > chk_score) { + chk_dup = 1; + } + } + + if (chk_dup) { + // the duplicate is the optical duplicate + if (!chk->opt) { // only change if not already an optical duplicate + if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) { + ret = -1; + goto fail; + } + + chk->opt = 1; + } + } else { + if (!current->opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + ret = -1; + goto fail; + } + + current->opt = 1; + } } } - current = current->duplicate; + curr++; + } + + fail: + return ret; +} + + +/* Where there is more than one duplicate go down the list and check for optical duplicates and change + do tags (where used) to point to original (non-duplicate) read. 
*/ +static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list, + const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single, + long *optical_pair, const int check_range) { + int ret = 0; + kliter_t(read_queue) *rq; + + rq = kl_begin(read_buffer); + + while (rq != kl_end(read_buffer)) { + read_queue_t *in_read = &kl_val(rq); + + if (check_range) { + /* Just check against the moving window of reads based on coordinates and max read length. */ + if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { + break; + } + } else { + // this is the last set of results and the end entry will be blank + if (!bam_get_qname(in_read->b)) { + break; + } + } + + if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain + + // check against the original for tagging and optical duplication + if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) { + if (ret < 0) { // real error + ret = -1; + break; + } else { // coordinate decoding error + ret = 0; + in_read->duplicate = NULL; + continue; + } + } + + // check the rest of the duplicates against each other for optical duplication + if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) { + ret = -1; + break; + } + + in_read->duplicate = NULL; + } + + rq = kl_next(rq); } return ret; } + /* Function to use when estimating library size. 
@@ -1080,30 +1343,29 @@ static inline double coverage_equation(double x, double c, double n) { /* estimate the library size, based on the Picard code in DuplicationMetrics.java*/ -static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) { +static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) { unsigned long estimated_size = 0; + unsigned long non_optical_pairs = (paired_reads - optical) / 2; + unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2; + unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2; - read_pairs /= 2; - duplicate_pairs /= 2; - - if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) { - unsigned long unique_pairs = read_pairs - duplicate_pairs; + if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) { double m = 1; double M = 100; int i; - if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) { + if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) { fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n"); return estimated_size; } - while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) { + while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) { M *= 10; } for (i = 0; i < 40; i++) { double r = (m + M) / 2; - double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs); + double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs); if (u > 0) { m = r; @@ -1119,7 +1381,7 @@ static unsigned long estimate_library_size(unsigned long read_pairs, unsigned lo fprintf(stderr, "[markdup] warning: unable to calculate estimated library size." 
" Read pairs %ld should be greater than duplicate pairs %ld," " which should both be non zero.\n", - read_pairs, duplicate_pairs); + non_optical_pairs, duplicate_pairs); } return estimated_size; @@ -1153,6 +1415,7 @@ static int bam_mark_duplicates(md_param_t *param) { tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; + check_list_t dup_list = {NULL, 0, 0}; if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { fprintf(stderr, "[markdup] out of memory\n"); @@ -1213,10 +1476,24 @@ static int bam_mark_duplicates(md_param_t *param) { goto fail; } + if (param->check_chain && !(param->tag || param->opt_dist)) + param->check_chain = 0; + + if (param->check_chain) { + dup_list.size = 128; + dup_list.c = NULL; + + if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) { + fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n"); + goto fail; + } + } + reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0; np_duplicate = np_opt_duplicate = 0; while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { + int dup_checked = 0; // do some basic coordinate order checks if (in_read->b->core.tid >= 0) { // -1 for unmapped reads @@ -1231,6 +1508,8 @@ static int bam_mark_duplicates(md_param_t *param) { prev_tid = in_read->b->core.tid; in_read->pair_key.single = 1; in_read->single_key.single = 0; + in_read->duplicate = NULL; + in_read->dup_checked = 0; reading++; @@ -1257,7 +1536,7 @@ static int bam_mark_duplicates(md_param_t *param) { // read must not be secondary, supplementary, unmapped or (possibly) failed QC if (!(in_read->b->core.flag & exclude)) { examined++; - in_read->duplicate = NULL; + // look at the pairs first if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { @@ -1300,17 +1579,15 @@ static int bam_mark_duplicates(md_param_t *param) { // scores more than one read of the pair bam1_t *dup = bp->p->b; - in_read->duplicate = bp->p; + 
if (param->check_chain) + in_read->duplicate = bp->p; + bp->p = in_read; if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) goto fail; single_dup++; - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - } } else { fprintf(stderr, "[markdup] error: single hashing failure.\n"); @@ -1327,8 +1604,7 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->pair_key = pair_key; } else if (ret == 0) { int64_t old_score, new_score, tie_add = 0; - bam1_t *dup; - int check_chain = 0; + bam1_t *dup = NULL; bp = &kh_val(pair_hash, k); @@ -1369,29 +1645,48 @@ static int bam_mark_duplicates(md_param_t *param) { if (new_score + tie_add > old_score) { // swap reads dup = bp->p->b; - in_read->duplicate = bp->p; + + if (param->check_chain) { + + if (in_read->duplicate) { + read_queue_t *current = in_read->duplicate; + + while (current->duplicate) { + current = current->duplicate; + } + + current->duplicate = bp->p; + } else { + in_read->duplicate = bp->p; + } + } + bp->p = in_read; } else { - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; + if (param->check_chain) { + if (bp->p->duplicate) { + if (in_read->duplicate) { + read_queue_t *current = bp->p->duplicate; + + while (current->duplicate) { + current = current->duplicate; + } + + current->duplicate = in_read->duplicate; + } + + in_read->duplicate = bp->p->duplicate; + } + + bp->p->duplicate = in_read; } - bp->p->duplicate = in_read; dup = in_read->b; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) goto fail; - if (check_chain) { - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; - } - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - duplicate++; } else { fprintf(stderr, "[markdup] error: pair hashing failure.\n"); @@ -1401,7 +1696,6 @@ 
static int bam_mark_duplicates(md_param_t *param) { int ret; key_data_t single_key; in_hash_t *bp; - int check_chain = 0; make_single_key(&single_key, in_read->b); @@ -1420,29 +1714,20 @@ static int bam_mark_duplicates(md_param_t *param) { if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { // if matched against one of a pair just mark as duplicate - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; - } - - bp->p->duplicate = in_read; - - if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) - goto fail; + if (param->check_chain) { + if (bp->p->duplicate) { + in_read->duplicate = bp->p->duplicate; + } - if (check_chain) { - // check the new duplicate entry in the chain - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; + bp->p->duplicate = in_read; } - // check against the new original - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) goto fail; } else { int64_t old_score, new_score; - bam1_t *dup; + bam1_t *dup = NULL; old_score = calc_score(bp->p->b); new_score = calc_score(in_read->b); @@ -1451,32 +1736,26 @@ static int bam_mark_duplicates(md_param_t *param) { // to the single hash and mark the other as duplicate if (new_score > old_score) { // swap reads dup = bp->p->b; - in_read->duplicate = bp->p; + + if (param->check_chain) + in_read->duplicate = bp->p; + bp->p = in_read; } else { - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; + if (param->check_chain) { + if (bp->p->duplicate) { + in_read->duplicate = bp->p->duplicate; + } + + bp->p->duplicate = in_read; } - bp->p->duplicate = in_read; dup = in_read->b; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) goto fail; - - - if 
(check_chain) { - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; - } - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - - - } + } single_dup++; } else { @@ -1500,6 +1779,22 @@ static int bam_mark_duplicates(md_param_t *param) { break; } + if (!dup_checked && param->check_chain) { + // check for multiple optical duplicates of the same original read + + if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) { + fprintf(stderr, "[markdup] error: duplicate checking failed.\n"); + goto fail; + } + + dup_checked = 1; + } + + + if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) { + break; + } + if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { @@ -1550,6 +1845,14 @@ static int bam_mark_duplicates(md_param_t *param) { goto fail; } + // one last check + if (param->tag || param->opt_dist) { + if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) { + fprintf(stderr, "[markdup] error: duplicate checking failed.\n"); + goto fail; + } + } + // write out the end of the list rq = kl_begin(read_buffer); while (rq != kl_end(read_buffer)) { @@ -1606,7 +1909,7 @@ static int bam_mark_duplicates(md_param_t *param) { np_duplicate++; if (param->tag && kh_val(dup_hash, k).name) { - if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { + if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) { fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); goto fail; } @@ -1614,10 +1917,10 @@ static int bam_mark_duplicates(md_param_t 
*param) { if (param->opt_dist) { if (kh_val(dup_hash, k).type) { - bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); + bam_aux_update_str(b, "dt", 3, "SQ"); np_opt_duplicate++; } else { - bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); + bam_aux_update_str(b, "dt", 3, "LB"); } } } @@ -1669,7 +1972,7 @@ static int bam_mark_duplicates(md_param_t *param) { fp = stderr; } - els = estimate_library_size(pair, duplicate - optical); + els = estimate_library_size(pair, duplicate, optical); fprintf(fp, "COMMAND: %s\n" @@ -1703,6 +2006,9 @@ static int bam_mark_duplicates(md_param_t *param) { } } + if (param->check_chain && (param->tag || param->opt_dist)) + free(dup_list.c); + kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); kl_destroy(read_queue, read_buffer); @@ -1723,6 +2029,9 @@ static int bam_mark_duplicates(md_param_t *param) { } kh_destroy(duplicates, dup_hash); + if (param->check_chain && (param->tag || param->opt_dist)) + free(dup_list.c); + kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); sam_hdr_destroy(header); @@ -1745,8 +2054,11 @@ static int markdup_usage(void) { fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" " TYPE = t measure positions based on template start/end (default).\n" " s measure positions based on sequence start.\n"); + fprintf(stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n"); + fprintf(stderr, " -u Output uncompressed data\n"); fprintf(stderr, " --include-fails Include quality check failed reads.\n"); fprintf(stderr, " --no-PG Do not add a PG line\n"); + fprintf(stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); @@ -1761,23 +2073,24 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { int c, ret; - char wmode[3] = {'w', 'b', 0}; + char wmode[4] = {'w', 'b', 0, 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {"include-fails", no_argument, NULL, 1001}, {"no-PG", no_argument, NULL, 1002}, {"mode", required_argument, NULL, 'm'}, + {"no-multi-dup", no_argument, NULL, 1003}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) { switch (c) { case 'r': param.remove_dups = 1; break; case 'l': param.max_length = atoi(optarg); break; @@ -1799,8 +2112,10 @@ int bam_markdup(int argc, char **argv) { } break; + case 'u': wmode[2] = '0'; break; case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; + case 1003: param.check_chain = 0; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index bcb9243..7132687 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -3,7 +3,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2019 Genome Research Ltd. + Copyright (C) 2017-2020 Genome Research Ltd. 
Author: Andrew Whitwham @@ -64,6 +64,7 @@ typedef struct { int mode; int write_index; int include_fails; + int check_chain; char *stats_file; char *arg_list; char *out_fn; @@ -85,6 +86,7 @@ typedef struct read_queue_s { bam1_t *b; struct read_queue_s *duplicate; hts_pos_t pos; + int dup_checked; } read_queue_t; typedef struct { @@ -96,8 +98,23 @@ typedef struct { char type; } dup_map_t; +typedef struct { + bam1_t *b; + int64_t score; + int64_t mate_score; + long x; + long y; + int opt; + int xpos; +} check_t; +typedef struct { + check_t *c; + size_t size; + size_t length; +} check_list_t; + static khint32_t do_hash(unsigned char *key, khint32_t len); static khint_t hash_key(key_data_t key) { @@ -667,6 +684,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n } +/* Get the position of the coordinates from the read name. */ static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { int sep = 0; int pos = 0; @@ -695,6 +713,66 @@ static inline int get_coordinate_positions(const char *qname, int *xpos, int *yp return sep; } + +static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) { + int ret = 1; + int seps, xpos = 0, ypos = 0; + long x = 0, y = 0; + char *end; + + seps = get_coordinate_positions(name, &xpos, &ypos); + + /* The most current Illumina read format at time of writing is: + @machine:run:flowcell:lane:tile:x:y:UMI or + @machine:run:flowcell:lane:tile:x:y + + Counting the separating colons gives us a quick format check. + Older name formats have fewer elements. 
+ */ + + if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name); + } + + return ret; + } + + x = strtol(name + xpos, &end, 10); + + if ((name + xpos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name); + } + + return ret; + } + + y = strtol(name + ypos, &end, 10); + + if ((name + ypos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name); + } + + return ret; + } + + *x_coord = x; + *y_coord = y; + *xpos_out = xpos; + ret = 0; + + return ret; +} + + /* Using the coordinates from the Illumina read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ @@ -808,6 +886,59 @@ static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warn } +/* Using the coordinates from the Illumina read name, see whether the duplicated read is + close enough (set by max_dist) to the original to be counted as optical. + + This function needs the values from the first read to be already calculated. 
*/ + +static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { + int ret = 0; + char *duplicate; + int dxpos = 0; + long dx, dy; + + duplicate = bam_get_qname(dup); + + if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) { + return ret; + } + + if (strncmp(name, duplicate, oxpos - 1) == 0) { + // the initial parts match, look at the numbers + long xdiff, ydiff; + + if (ox > dx) { + xdiff = ox - dx; + } else { + xdiff = dx - ox; + } + + if (xdiff <= max_dist) { + // still might be optical + + if (oy > dy) { + ydiff = oy - dy; + } else { + ydiff = dy - oy; + } + + if (ydiff <= max_dist) ret = 1; + } + } + + c->x = dx; + c->y = dy; + c->xpos = dxpos; + + if (ret) { + c->opt = ret; + } + + return ret; +} + + +/* Mark the read as a duplicate and update the duplicate hash (if needed) */ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, long *optical, long *warn) { char dup_type = 0; @@ -816,7 +947,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam dup->core.flag |= BAM_FDUP; if (param->tag) { - if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { + if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) { fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); return -1; } @@ -824,12 +955,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam if (param->opt_dist) { // mark optical duplicates if (optical_duplicate(ori, dup, param->opt_dist, warn)) { - bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); + bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; } else { // not an optical duplicate - bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB"); + bam_aux_update_str(dup, "dt", 3, "LB"); } } @@ -855,17 +986,12 @@ static 
int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } +/* If the duplicate type has changed to optical then retag and duplicate hash. */ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { int ret = 0; - uint8_t *data; - // remove any existing dt tag - if ((data = bam_aux_get(b, "dt")) != NULL) { - bam_aux_del(b, data); - } - - if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { - fprintf(samtools_stderr, "[markdup] error: unable to append 'dt' tag.\n"); + if (bam_aux_update_str(b, "dt", 3, "SQ")) { + fprintf(samtools_stderr, "[markdup] error: unable to update 'dt' tag.\n"); ret = -1; } @@ -899,23 +1025,54 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash } +/* Check all duplicates of the highest quality read (the "original") for consistancy. Also + pre-calculate any values for use in check_duplicate_chain later. + Returns 0 on success, >0 on coordinate reading error (program can continue) or + <0 on an error (program should not continue. */ +static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, + check_list_t *list, long *warn, long *optical_single, long *optical_pair) { -/* - Where there is more than one duplicate go down the list and check for optical duplicates and change - do tags (where used) to point to original (non-duplicate) read. 
-*/ -static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, - long *warn, long *optical_single, long *optical_pair) { int ret = 0; - read_queue_t *current = ori->duplicate; char *ori_name = bam_get_qname(ori->b); - int have_original = !(ori->b->core.flag & BAM_FDUP); - int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); + read_queue_t *current = ori->duplicate; + int xpos; + long x, y; + + if (param->opt_dist) { + if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) { + return ret; + } + } + + list->length = 0; while (current) { - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + check_t *c; + + if (list->length >= list->size) { + check_t *tmp; + + list->size *= 2; + + if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) { + fprintf(samtools_stderr, "[markdup] error: Unable to expand opt check list.\n"); + return -1; + } + + list->c = tmp; + } + + c = &list->c[list->length]; - if (param->tag && have_original) { + c->b = current->b; + c->x = -1; + c->y = -1; + c->opt = 0; + c->score = 0; + c->mate_score = 0; + current->dup_checked = 1; + + if (param->tag) { uint8_t *data; // at this stage all duplicates should have a do tag @@ -925,10 +1082,8 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has if (old_name) { if (strcmp(old_name, ori_name) != 0) { - bam_aux_del(current->b, data); - - if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { - fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); + if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) { + fprintf(samtools_stderr, "[markdup] error: unable to update 'do' tag.\n"); ret = -1; break; } @@ -942,118 +1097,226 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has } if (param->opt_dist) { - int is_cur_opt = 0, 
is_ori_opt = 0; uint8_t *data; char *dup_type; + int is_opt = 0; + int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); - if ((data = bam_aux_get(ori->b, "dt"))) { + if ((data = bam_aux_get(current->b, "dt"))) { if ((dup_type = bam_aux2Z(data))) { if (strcmp(dup_type, "SQ") == 0) { - is_ori_opt = 1; + c->opt = 1; } } } - if ((data = bam_aux_get(current->b, "dt"))) { - if ((dup_type = bam_aux2Z(data))) { - if (strcmp(dup_type, "SQ") == 0) { - is_cur_opt = 1; - } + // need to run this to get the duplicates x and y scores + is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn); + + if (!c->opt && is_opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + ret = -1; + break; } + + c->opt = 1; } - if (!(is_ori_opt && is_cur_opt)) { - // if both are already optical duplicates there is no need to check again, otherwise... + c->score = calc_score(current->b); - if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { - // find out which one is the duplicate - int is_cur_dup = 0; + if (current_paired) { + if ((c->mate_score = get_mate_score(current->b)) == -1) { + fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + ret = -1; + break; + } + } + } - if (have_original) { - // compared against an original, this is a dup. - is_cur_dup = 1; - } else if (ori_paired != current_paired) { - if (!current_paired) { - // current is single vs pair, this is a dup. 
- is_cur_dup = 1; - } - } else { - // do it by scores - int64_t ori_score, curr_score; + current = current->duplicate; + list->length++; + } - if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { - if (ori->b->core.flag & BAM_FQCFAIL) { - ori_score = 0; - curr_score = 1; - } else { - ori_score = 1; - curr_score = 0; - } - } else { - ori_score = calc_score(ori->b); - curr_score = calc_score(current->b); - - if (current_paired) { - // they are pairs so add mate scores. - int64_t mate_tmp; - - if ((mate_tmp = get_mate_score(ori->b)) == -1) { - fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); - ret = -1; - break; - } else { - ori_score += mate_tmp; - } + return ret; +} - if ((mate_tmp = get_mate_score(current->b)) == -1) { - fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); - ret = -1; - break; - } else { - curr_score += mate_tmp; - } - } - } - if (ori_score == curr_score) { - if (strcmp(bam_get_qname(current->b), ori_name) < 0) { - curr_score++; - } else { - curr_score--; - } - } +static int xcoord_sort(const void *a, const void *b) { + check_t *ac = (check_t *) a; + check_t *bc = (check_t *) b; - if (ori_score > curr_score) { - is_cur_dup = 1; - } + return (ac->x - bc->x); +} + + +/* Check all the duplicates against each other to see if they are optical duplicates. 
*/ +static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list, + long *warn, long *optical_single, long *optical_pair) { + int ret = 0; + size_t curr = 0; + + qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort); + + while (curr < list->length - 1) { + check_t *current = &list->c[curr]; + size_t count = curr; + char *cur_name = bam_get_qname(current->b); + int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + + while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) { + // while close enough along the x coordinate + check_t *chk = &list->c[count]; + + if (current->opt && chk->opt) + continue; + + // if both are already optical duplicates there is no need to check again, otherwise... + + long ydiff; + + if (current->y > chk->y) { + ydiff = current->y - chk->y; + } else { + ydiff = chk->y - current->y; + } + + if (ydiff > param->opt_dist) + continue; + + // the number are right, check the names + if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0) + continue; + + // optical duplicates + int chk_dup = 0; + int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); + + if (current_paired != chk_paired) { + if (!chk_paired) { + // chk is single vs pair, this is a dup. 
+ chk_dup = 1; + } + } else { + // do it by scores + int64_t cur_score, chk_score; + + if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) { + if (current->b->core.flag & BAM_FQCFAIL) { + cur_score = 0; + chk_score = 1; + } else { + cur_score = 1; + chk_score = 0; } + } else { + cur_score = current->score; + chk_score = chk->score; - if (is_cur_dup) { - // the current is the optical duplicate - if (!is_cur_opt) { // only change if not already an optical duplicate - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { - ret = -1; - break; - } - } + if (current_paired) { + // they are pairs so add mate scores. + chk_score += chk->mate_score; + cur_score += current->mate_score; + } + } + + if (cur_score == chk_score) { + if (strcmp(bam_get_qname(chk->b), cur_name) < 0) { + chk_score++; } else { - if (!is_ori_opt) { - if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { - ret = -1; - break; - } - } + chk_score--; } } + + if (cur_score > chk_score) { + chk_dup = 1; + } + } + + if (chk_dup) { + // the duplicate is the optical duplicate + if (!chk->opt) { // only change if not already an optical duplicate + if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) { + ret = -1; + goto fail; + } + + chk->opt = 1; + } + } else { + if (!current->opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + ret = -1; + goto fail; + } + + current->opt = 1; + } } } - current = current->duplicate; + curr++; + } + + fail: + return ret; +} + + +/* Where there is more than one duplicate go down the list and check for optical duplicates and change + do tags (where used) to point to original (non-duplicate) read. 
*/ +static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list, + const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single, + long *optical_pair, const int check_range) { + int ret = 0; + kliter_t(read_queue) *rq; + + rq = kl_begin(read_buffer); + + while (rq != kl_end(read_buffer)) { + read_queue_t *in_read = &kl_val(rq); + + if (check_range) { + /* Just check against the moving window of reads based on coordinates and max read length. */ + if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { + break; + } + } else { + // this is the last set of results and the end entry will be blank + if (!bam_get_qname(in_read->b)) { + break; + } + } + + if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain + + // check against the original for tagging and optical duplication + if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) { + if (ret < 0) { // real error + ret = -1; + break; + } else { // coordinate decoding error + ret = 0; + in_read->duplicate = NULL; + continue; + } + } + + // check the rest of the duplicates against each other for optical duplication + if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) { + ret = -1; + break; + } + + in_read->duplicate = NULL; + } + + rq = kl_next(rq); } return ret; } + /* Function to use when estimating library size. 
@@ -1082,30 +1345,29 @@ static inline double coverage_equation(double x, double c, double n) { /* estimate the library size, based on the Picard code in DuplicationMetrics.java*/ -static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) { +static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) { unsigned long estimated_size = 0; + unsigned long non_optical_pairs = (paired_reads - optical) / 2; + unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2; + unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2; - read_pairs /= 2; - duplicate_pairs /= 2; - - if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) { - unsigned long unique_pairs = read_pairs - duplicate_pairs; + if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) { double m = 1; double M = 100; int i; - if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) { + if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) { fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n"); return estimated_size; } - while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) { + while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) { M *= 10; } for (i = 0; i < 40; i++) { double r = (m + M) / 2; - double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs); + double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs); if (u > 0) { m = r; @@ -1121,7 +1383,7 @@ static unsigned long estimate_library_size(unsigned long read_pairs, unsigned lo fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library 
size." " Read pairs %ld should be greater than duplicate pairs %ld," " which should both be non zero.\n", - read_pairs, duplicate_pairs); + non_optical_pairs, duplicate_pairs); } return estimated_size; @@ -1155,6 +1417,7 @@ static int bam_mark_duplicates(md_param_t *param) { tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; + check_list_t dup_list = {NULL, 0, 0}; if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { fprintf(samtools_stderr, "[markdup] out of memory\n"); @@ -1215,10 +1478,24 @@ static int bam_mark_duplicates(md_param_t *param) { goto fail; } + if (param->check_chain && !(param->tag || param->opt_dist)) + param->check_chain = 0; + + if (param->check_chain) { + dup_list.size = 128; + dup_list.c = NULL; + + if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) { + fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for dup_list.\n"); + goto fail; + } + } + reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0; np_duplicate = np_opt_duplicate = 0; while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { + int dup_checked = 0; // do some basic coordinate order checks if (in_read->b->core.tid >= 0) { // -1 for unmapped reads @@ -1233,6 +1510,8 @@ static int bam_mark_duplicates(md_param_t *param) { prev_tid = in_read->b->core.tid; in_read->pair_key.single = 1; in_read->single_key.single = 0; + in_read->duplicate = NULL; + in_read->dup_checked = 0; reading++; @@ -1259,7 +1538,7 @@ static int bam_mark_duplicates(md_param_t *param) { // read must not be secondary, supplementary, unmapped or (possibly) failed QC if (!(in_read->b->core.flag & exclude)) { examined++; - in_read->duplicate = NULL; + // look at the pairs first if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { @@ -1302,17 +1581,15 @@ static int bam_mark_duplicates(md_param_t *param) { // scores more than one read of the pair bam1_t *dup = bp->p->b; - 
in_read->duplicate = bp->p; + if (param->check_chain) + in_read->duplicate = bp->p; + bp->p = in_read; if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) goto fail; single_dup++; - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - } } else { fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); @@ -1329,8 +1606,7 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->pair_key = pair_key; } else if (ret == 0) { int64_t old_score, new_score, tie_add = 0; - bam1_t *dup; - int check_chain = 0; + bam1_t *dup = NULL; bp = &kh_val(pair_hash, k); @@ -1371,29 +1647,48 @@ static int bam_mark_duplicates(md_param_t *param) { if (new_score + tie_add > old_score) { // swap reads dup = bp->p->b; - in_read->duplicate = bp->p; + + if (param->check_chain) { + + if (in_read->duplicate) { + read_queue_t *current = in_read->duplicate; + + while (current->duplicate) { + current = current->duplicate; + } + + current->duplicate = bp->p; + } else { + in_read->duplicate = bp->p; + } + } + bp->p = in_read; } else { - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; + if (param->check_chain) { + if (bp->p->duplicate) { + if (in_read->duplicate) { + read_queue_t *current = bp->p->duplicate; + + while (current->duplicate) { + current = current->duplicate; + } + + current->duplicate = in_read->duplicate; + } + + in_read->duplicate = bp->p->duplicate; + } + + bp->p->duplicate = in_read; } - bp->p->duplicate = in_read; dup = in_read->b; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) goto fail; - if (check_chain) { - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; - } - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - duplicate++; } else { fprintf(samtools_stderr, "[markdup] error: 
pair hashing failure.\n"); @@ -1403,7 +1698,6 @@ static int bam_mark_duplicates(md_param_t *param) { int ret; key_data_t single_key; in_hash_t *bp; - int check_chain = 0; make_single_key(&single_key, in_read->b); @@ -1422,29 +1716,20 @@ static int bam_mark_duplicates(md_param_t *param) { if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { // if matched against one of a pair just mark as duplicate - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; - } - - bp->p->duplicate = in_read; - - if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) - goto fail; + if (param->check_chain) { + if (bp->p->duplicate) { + in_read->duplicate = bp->p->duplicate; + } - if (check_chain) { - // check the new duplicate entry in the chain - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; + bp->p->duplicate = in_read; } - // check against the new original - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) goto fail; } else { int64_t old_score, new_score; - bam1_t *dup; + bam1_t *dup = NULL; old_score = calc_score(bp->p->b); new_score = calc_score(in_read->b); @@ -1453,32 +1738,26 @@ static int bam_mark_duplicates(md_param_t *param) { // to the single hash and mark the other as duplicate if (new_score > old_score) { // swap reads dup = bp->p->b; - in_read->duplicate = bp->p; + + if (param->check_chain) + in_read->duplicate = bp->p; + bp->p = in_read; } else { - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; + if (param->check_chain) { + if (bp->p->duplicate) { + in_read->duplicate = bp->p->duplicate; + } + + bp->p->duplicate = in_read; } - bp->p->duplicate = in_read; dup = in_read->b; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, 
&single_optical, &opt_warnings)) goto fail; - - - if (check_chain) { - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; - } - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - - - } + } single_dup++; } else { @@ -1502,6 +1781,22 @@ static int bam_mark_duplicates(md_param_t *param) { break; } + if (!dup_checked && param->check_chain) { + // check for multiple optical duplicates of the same original read + + if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) { + fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n"); + goto fail; + } + + dup_checked = 1; + } + + + if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) { + break; + } + if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { @@ -1552,6 +1847,14 @@ static int bam_mark_duplicates(md_param_t *param) { goto fail; } + // one last check + if (param->tag || param->opt_dist) { + if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) { + fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n"); + goto fail; + } + } + // write out the end of the list rq = kl_begin(read_buffer); while (rq != kl_end(read_buffer)) { @@ -1608,7 +1911,7 @@ static int bam_mark_duplicates(md_param_t *param) { np_duplicate++; if (param->tag && kh_val(dup_hash, k).name) { - if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { + if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) { fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); 
goto fail; } @@ -1616,10 +1919,10 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->opt_dist) { if (kh_val(dup_hash, k).type) { - bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); + bam_aux_update_str(b, "dt", 3, "SQ"); np_opt_duplicate++; } else { - bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); + bam_aux_update_str(b, "dt", 3, "LB"); } } } @@ -1671,7 +1974,7 @@ static int bam_mark_duplicates(md_param_t *param) { fp = samtools_stderr; } - els = estimate_library_size(pair, duplicate - optical); + els = estimate_library_size(pair, duplicate, optical); fprintf(fp, "COMMAND: %s\n" @@ -1705,6 +2008,9 @@ static int bam_mark_duplicates(md_param_t *param) { } } + if (param->check_chain && (param->tag || param->opt_dist)) + free(dup_list.c); + kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); kl_destroy(read_queue, read_buffer); @@ -1725,6 +2031,9 @@ static int bam_mark_duplicates(md_param_t *param) { } kh_destroy(duplicates, dup_hash); + if (param->check_chain && (param->tag || param->opt_dist)) + free(dup_list.c); + kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); sam_hdr_destroy(header); @@ -1747,8 +2056,11 @@ static int markdup_usage(void) { fprintf(samtools_stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" " TYPE = t measure positions based on template start/end (default).\n" " s measure positions based on sequence start.\n"); + fprintf(samtools_stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n"); + fprintf(samtools_stderr, " -u Output uncompressed data\n"); fprintf(samtools_stderr, " --include-fails Include quality check failed reads.\n"); fprintf(samtools_stderr, " --no-PG Do not add a PG line\n"); + fprintf(samtools_stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); @@ -1763,23 +2075,24 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { int c, ret; - char wmode[3] = {'w', 'b', 0}; + char wmode[4] = {'w', 'b', 0, 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {"include-fails", no_argument, NULL, 1001}, {"no-PG", no_argument, NULL, 1002}, {"mode", required_argument, NULL, 'm'}, + {"no-multi-dup", no_argument, NULL, 1003}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) { switch (c) { case 'r': param.remove_dups = 1; break; case 'l': param.max_length = atoi(optarg); break; @@ -1801,8 +2114,10 @@ int bam_markdup(int argc, char **argv) { } break; + case 'u': wmode[2] = '0'; break; case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; + case 1003: param.check_chain = 0; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 6d40144..4239fd1 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -372,7 +372,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop curr = 1 - curr; pre_end = cur_end; } - if (result < -1) goto fail; + if (result < -1) goto read_fail; if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If 
unmapped @@ -391,6 +391,10 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop ks_free(&str); return 0; + read_fail: + print_error("fixmate", "Couldn't read from input file"); + goto fail; + write_fail: print_error_errno("fixmate", "Couldn't write to output file"); fail: @@ -410,6 +414,7 @@ void usage(FILE* where) " -p Disable FR proper pair check\n" " -c Add template cigar ct tag\n" " -m Add mate score tag\n" +" -u Uncompressed output\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(where, "-.O..@-."); @@ -427,7 +432,7 @@ int bam_mating(int argc, char *argv[]) samFile *in = NULL, *out = NULL; int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - char wmode[3] = {'w', 'b', 0}; + char wmode[4] = {'w', 'b', 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {"no-PG", no_argument, NULL, 1}, @@ -437,12 +442,13 @@ int bam_mating(int argc, char *argv[]) // parse args if (argc == 1) { usage(stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; case 'c': add_ct = 1; break; case 'm': mate_score = 1; break; + case 'u': wmode[2] = '0'; break; case 1: no_pg = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index edefb0b..0aa83ec 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -374,7 +374,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop curr = 1 - curr; pre_end = cur_end; } - if (result < -1) goto fail; + if (result < -1) goto read_fail; if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be 
unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped @@ -393,6 +393,10 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop ks_free(&str); return 0; + read_fail: + print_error("fixmate", "Couldn't read from input file"); + goto fail; + write_fail: print_error_errno("fixmate", "Couldn't write to output file"); fail: @@ -412,6 +416,7 @@ void usage(FILE* where) " -p Disable FR proper pair check\n" " -c Add template cigar ct tag\n" " -m Add mate score tag\n" +" -u Uncompressed output\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(where, "-.O..@-."); @@ -429,7 +434,7 @@ int bam_mating(int argc, char *argv[]) samFile *in = NULL, *out = NULL; int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - char wmode[3] = {'w', 'b', 0}; + char wmode[4] = {'w', 'b', 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {"no-PG", no_argument, NULL, 1}, @@ -439,12 +444,13 @@ int bam_mating(int argc, char *argv[]) // parse args if (argc == 1) { usage(samtools_stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; case 'c': add_ct = 1; break; case 'm': mate_score = 1; break; + case 'u': wmode[2] = '0'; break; case 1: no_pg = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ diff --git a/samtools/bam_md.c b/samtools/bam_md.c index 9277788..7d5aeaa 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -1,6 +1,6 @@ /* bam_md.c -- calmd subcommand. - Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd. 
Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -30,6 +30,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/faidx.h" #include "htslib/sam.h" #include "htslib/kstring.h" @@ -46,102 +47,136 @@ DEALINGS IN THE SOFTWARE. */ int bam_aux_drop_other(bam1_t *b, uint8_t *s); -void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode) +static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref, + hts_pos_t ref_len, int flag, int max_nm, + int quiet_mode, uint32_t *skipped) { uint8_t *seq = bam_get_seq(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - int i, y, u = 0; - hts_pos_t x; - kstring_t *str; + int i, qpos, matched = 0; + hts_pos_t rpos; + kstring_t str = KS_INITIALIZE; int32_t old_nm_i = -1, nm = 0; + uint32_t err = 0; - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (c->l_qseq == 0) { + if (!quiet_mode) { + if (ref_name) { + fprintf(stderr, "[bam_fillmd1] no sequence in alignment " + "record for '%s' at %s:%"PRIhts_pos", skipped\n", + bam_get_qname(b), ref_name, c->pos + 1); + } else { + fprintf(stderr, "[bam_fillmd1] no sequence in alignment " + "record for '%s', skipped", bam_get_qname(b)); + } + } + if (skipped) (*skipped)++; + return 0; + } + + for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) { + int j, oplen = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int c1, c2, z = y + j; - if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; + for (j = 0; j < oplen; ++j) { + int c1, c2, z = qpos + j; + if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0') + break; // out of bounds + c1 = bam_seqi(seq, z); + c2 = seq_nt16_table[(uint8_t)ref[rpos+j]]; if ((c1 == c2 && c1 != 15 
&& c2 != 15) || c1 == 0) { // a match if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; - ++u; + ++matched; } else { - kputw(u, str); kputc(toupper(ref[x+j]), str); - u = 0; ++nm; + err |= kputw(matched, &str) < 0; + err |= kputc(toupper(ref[rpos+j]), &str) < 0; + matched = 0; ++nm; } } - if (j < l) break; - x += l; y += l; + if (j < oplen) break; + rpos += oplen; qpos += oplen; } else if (op == BAM_CDEL) { - kputw(u, str); kputc('^', str); - for (j = 0; j < l; ++j) { - if (x+j >= ref_len || ref[x+j] == '\0') break; - kputc(toupper(ref[x+j]), str); + err |= kputw(matched, &str) < 0; + err |= kputc('^', &str) < 0; + for (j = 0; j < oplen; ++j) { + if (rpos+j >= ref_len || ref[rpos+j] == '\0') break; + err |= kputc(toupper(ref[rpos+j]), &str) < 0; } - u = 0; - x += j; nm += j; - if (j < l) break; + matched = 0; + rpos += j; nm += j; + if (j < oplen) break; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - y += l; - if (op == BAM_CINS) nm += l; + qpos += oplen; + if (op == BAM_CINS) nm += oplen; } else if (op == BAM_CREF_SKIP) { - x += l; + rpos += oplen; } } - kputw(u, str); + err |= kputw(matched, &str) < 0; + if (err) { + print_error_errno("calmd", "Couldn't build new MD string"); + goto fail; + } // apply max_nm if (max_nm > 0 && nm >= max_nm) { - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; + for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) { + int j, oplen = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int c1, c2, z = y + j; - if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; + for (j = 0; j < oplen; ++j) { + int c1, c2, z = qpos + j; + if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0') + break; // out of bounds + c1 = bam_seqi(seq, z); + c2 = seq_nt16_table[(uint8_t)ref[rpos+j]]; if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { 
// a match seq[z/2] |= (z&1)? 0x0f : 0xf0; bam_get_qual(b)[z] = 0; } } - if (j < l) break; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + if (j < oplen) break; + rpos += oplen; qpos += oplen; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen; } } // update NM if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) { uint8_t *old_nm = bam_aux_get(b, "NM"); if (old_nm) old_nm_i = bam_aux2i(old_nm); - if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (!old_nm) { + if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0) + goto aux_fail; + } else if (nm != old_nm_i) { if (!quiet_mode) { fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); } - bam_aux_del(b, old_nm); - bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (bam_aux_del(b, old_nm) < 0) goto aux_fail; + if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0) + goto aux_fail; } } // update MD if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) { uint8_t *old_md = bam_aux_get(b, "MD"); - if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - else { + if (!old_md) { + if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0) + goto aux_fail; + } else { int is_diff = 0; - if (strlen((char*)old_md+1) == str->l) { - for (i = 0; i < str->l; ++i) - if (toupper(old_md[i+1]) != toupper(str->s[i])) + if (strlen((char*)old_md+1) == str.l) { + for (i = 0; i < str.l; ++i) + if (toupper(old_md[i+1]) != toupper(str.s[i])) break; - if (i < str->l) is_diff = 1; + if (i < str.l) is_diff = 1; } else is_diff = 1; if (is_diff) { if (!quiet_mode) { - fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, 
str.s); } - bam_aux_del(b, old_md); - bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + if (bam_aux_del(b, old_md) < 0) goto aux_fail; + if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0) + goto aux_fail; } } } @@ -158,12 +193,25 @@ void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; } - free(str->s); free(str); + free(str.s); + return 0; + + aux_fail: + if (errno == ENOMEM) { + print_error("calmd", "Couldn't add aux tag (too long)"); + } else if (errno == EINVAL) { + print_error("calmd", "Corrupt aux data"); + } else { + print_error_errno("calmd", "Couldn't add aux tag"); + } + fail: + free(str.s); + return -1; } -void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode) +int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode) { - bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode); + return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL); } int calmd_usage() { @@ -193,8 +241,10 @@ int bam_fillmd(int argc, char *argv[]) sam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL; + const char *ref_name = NULL; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + uint32_t skipped = 0; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), @@ -294,20 +344,34 @@ int bam_fillmd(int argc, char *argv[]) if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); - ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len); + ref = NULL; + len = 0; + ref_name = sam_hdr_tid2name(header, b->core.tid); + if (ref_name) { + ref = fai_fetch64(fai, ref_name, &len); + } tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", - sam_hdr_tid2name(header, tid)); + ref_name ? 
ref_name : "(unknown)"); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } - if (is_realn) sam_prob_realn(b, ref, len, baq_flag); + if (is_realn) { + if (sam_prob_realn(b, ref, len, baq_flag) < -3) { + print_error_errno("calmd", "BAQ alignment failed"); + goto fail; + } + } if (capQ > 10) { int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } - if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode); + if (ref) { + if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm, + quiet_mode, &skipped) < 0) + goto fail; + } } if (sam_write1(fpout, header, b) < 0) { print_error_errno("calmd", "failed to write to output file"); @@ -318,6 +382,13 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } + + if (skipped) { + fprintf(stderr, "[calmd] Warning: %"PRIu32" records skipped due " + "to no query sequence\n", + skipped); + } + bam_destroy1(b); sam_hdr_destroy(header); diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index 93990b9..b71e77c 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -2,7 +2,7 @@ /* bam_md.c -- calmd subcommand. - Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/faidx.h" #include "htslib/sam.h" #include "htslib/kstring.h" @@ -48,102 +49,136 @@ DEALINGS IN THE SOFTWARE. 
*/ int bam_aux_drop_other(bam1_t *b, uint8_t *s); -void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode) +static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref, + hts_pos_t ref_len, int flag, int max_nm, + int quiet_mode, uint32_t *skipped) { uint8_t *seq = bam_get_seq(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - int i, y, u = 0; - hts_pos_t x; - kstring_t *str; + int i, qpos, matched = 0; + hts_pos_t rpos; + kstring_t str = KS_INITIALIZE; int32_t old_nm_i = -1, nm = 0; + uint32_t err = 0; - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (c->l_qseq == 0) { + if (!quiet_mode) { + if (ref_name) { + fprintf(samtools_stderr, "[bam_fillmd1] no sequence in alignment " + "record for '%s' at %s:%"PRIhts_pos", skipped\n", + bam_get_qname(b), ref_name, c->pos + 1); + } else { + fprintf(samtools_stderr, "[bam_fillmd1] no sequence in alignment " + "record for '%s', skipped", bam_get_qname(b)); + } + } + if (skipped) (*skipped)++; + return 0; + } + + for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) { + int j, oplen = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int c1, c2, z = y + j; - if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; + for (j = 0; j < oplen; ++j) { + int c1, c2, z = qpos + j; + if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0') + break; // out of bounds + c1 = bam_seqi(seq, z); + c2 = seq_nt16_table[(uint8_t)ref[rpos+j]]; if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; - ++u; + ++matched; } else { - kputw(u, str); kputc(toupper(ref[x+j]), str); - u = 0; ++nm; + err |= kputw(matched, &str) < 0; + err |= kputc(toupper(ref[rpos+j]), &str) < 0; + matched = 0; ++nm; } } - if (j < l) break; - x += l; y += l; + if (j < oplen) break; + rpos += oplen; qpos += oplen; } else if (op == BAM_CDEL) { - kputw(u, str); kputc('^', str); - for (j = 0; j < l; ++j) { - if (x+j >= ref_len || ref[x+j] == '\0') break; - kputc(toupper(ref[x+j]), str); + err |= kputw(matched, &str) < 0; + err |= kputc('^', &str) < 0; + for (j = 0; j < oplen; ++j) { + if (rpos+j >= ref_len || ref[rpos+j] == '\0') break; + err |= kputc(toupper(ref[rpos+j]), &str) < 0; } - u = 0; - x += j; nm += j; - if (j < l) break; + matched = 0; + rpos += j; nm += j; + if (j < oplen) break; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - y += l; - if (op == BAM_CINS) nm += l; + qpos += oplen; + if (op == BAM_CINS) nm += oplen; } else if (op == BAM_CREF_SKIP) { - x += l; + rpos += oplen; } } - kputw(u, str); + err |= kputw(matched, &str) < 0; + if (err) { + print_error_errno("calmd", "Couldn't build new MD string"); + goto fail; + } // apply max_nm if (max_nm > 0 && nm >= max_nm) { - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; + for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) { + int j, oplen = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int c1, c2, z = y + j; - if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; + for (j = 0; j < oplen; ++j) { + int c1, c2, z = qpos + j; + if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0') + break; // out of bounds + c1 = bam_seqi(seq, z); + c2 = seq_nt16_table[(uint8_t)ref[rpos+j]]; if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match seq[z/2] |= (z&1)? 
0x0f : 0xf0; bam_get_qual(b)[z] = 0; } } - if (j < l) break; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + if (j < oplen) break; + rpos += oplen; qpos += oplen; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen; } } // update NM if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) { uint8_t *old_nm = bam_aux_get(b, "NM"); if (old_nm) old_nm_i = bam_aux2i(old_nm); - if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (!old_nm) { + if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0) + goto aux_fail; + } else if (nm != old_nm_i) { if (!quiet_mode) { fprintf(samtools_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); } - bam_aux_del(b, old_nm); - bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (bam_aux_del(b, old_nm) < 0) goto aux_fail; + if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0) + goto aux_fail; } } // update MD if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) { uint8_t *old_md = bam_aux_get(b, "MD"); - if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - else { + if (!old_md) { + if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0) + goto aux_fail; + } else { int is_diff = 0; - if (strlen((char*)old_md+1) == str->l) { - for (i = 0; i < str->l; ++i) - if (toupper(old_md[i+1]) != toupper(str->s[i])) + if (strlen((char*)old_md+1) == str.l) { + for (i = 0; i < str.l; ++i) + if (toupper(old_md[i+1]) != toupper(str.s[i])) break; - if (i < str->l) is_diff = 1; + if (i < str.l) is_diff = 1; } else is_diff = 1; if (is_diff) { if (!quiet_mode) { - fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); + fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s); } 
- bam_aux_del(b, old_md); - bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + if (bam_aux_del(b, old_md) < 0) goto aux_fail; + if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0) + goto aux_fail; } } } @@ -160,12 +195,25 @@ void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; } - free(str->s); free(str); + free(str.s); + return 0; + + aux_fail: + if (errno == ENOMEM) { + print_error("calmd", "Couldn't add aux tag (too long)"); + } else if (errno == EINVAL) { + print_error("calmd", "Corrupt aux data"); + } else { + print_error_errno("calmd", "Couldn't add aux tag"); + } + fail: + free(str.s); + return -1; } -void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode) +int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode) { - bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode); + return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL); } int calmd_usage() { @@ -195,8 +243,10 @@ int bam_fillmd(int argc, char *argv[]) sam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL; + const char *ref_name = NULL; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + uint32_t skipped = 0; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), @@ -296,20 +346,34 @@ int bam_fillmd(int argc, char *argv[]) if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); - ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len); + ref = NULL; + len = 0; + ref_name = sam_hdr_tid2name(header, b->core.tid); + if (ref_name) { + ref = fai_fetch64(fai, ref_name, &len); + } tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", - sam_hdr_tid2name(header, tid)); + ref_name ? 
ref_name : "(unknown)"); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } - if (is_realn) sam_prob_realn(b, ref, len, baq_flag); + if (is_realn) { + if (sam_prob_realn(b, ref, len, baq_flag) < -3) { + print_error_errno("calmd", "BAQ alignment failed"); + goto fail; + } + } if (capQ > 10) { int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } - if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode); + if (ref) { + if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm, + quiet_mode, &skipped) < 0) + goto fail; + } } if (sam_write1(fpout, header, b) < 0) { print_error_errno("calmd", "failed to write to output file"); @@ -320,6 +384,13 @@ int bam_fillmd(int argc, char *argv[]) fprintf(samtools_stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } + + if (skipped) { + fprintf(samtools_stderr, "[calmd] Warning: %"PRIu32" records skipped due " + "to no query sequence\n", + skipped); + } + bam_destroy1(b); sam_hdr_destroy(header); diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index 0497fb6..6fd282c 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -1,6 +1,6 @@ /* bam_plcmd.c -- mpileup subcommand. - Copyright (C) 2008-2015, 2019 Genome Research Ltd. + Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -90,8 +90,10 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, int del_len = -p->indel; if (p->indel > 0) { int len = bam_plp_insertion(p, ks, &del_len); - if (len < 0) + if (len < 0) { + print_error("mpileup", "bam_plp_insertion() failed"); return -1; + } putc('+', fp); printw(len, fp); if (bam_is_rev(p->b)) { char pad = rev_del ? 
'#' : '*'; @@ -126,10 +128,11 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_REDO_BAQ (1<<6) #define MPLP_ILLUMINA13 (1<<7) #define MPLP_IGNORE_RG (1<<8) -#define MPLP_PRINT_QPOS (1<<9) -#define MPLP_PER_SAMPLE (1<<11) -#define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_PER_SAMPLE (1<<9) +#define MPLP_SMART_OVERLAPS (1<<10) +#define MPLP_PRINT_MAPQ_CHAR (1<<11) +#define MPLP_PRINT_QPOS (1<<12) #define MPLP_PRINT_QNAME (1<<13) #define MPLP_PRINT_FLAG (1<<14) #define MPLP_PRINT_RNAME (1<<15) @@ -294,9 +297,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { fputs("\t0\t*\t*", fp); - if (conf->flag & MPLP_PRINT_QPOS) - fputs("\t*", fp); - int flag_value = MPLP_PRINT_QNAME; + int flag_value = MPLP_PRINT_MAPQ_CHAR; while(flag_value < MPLP_PRINT_QUAL + 1) { if (conf->flag & flag_value) fputs("\t*", fp); @@ -757,9 +758,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); - if (conf->flag & MPLP_PRINT_QPOS) - fputs("\t*", pileup_fp); - int flag_value = MPLP_PRINT_QNAME; + int flag_value = MPLP_PRINT_MAPQ_CHAR; while(flag_value < MPLP_PRINT_QUAL + 1) { if (conf->flag & flag_value) fputs("\t*", pileup_fp); @@ -805,25 +804,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } if (!n) putc('*', pileup_fp); - /* Print mpileup positions */ - if (conf->flag & MPLP_PRINT_QPOS) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? 
bam_get_qual(p->b)[p->qpos] - : 0; - if ( c < conf->min_baseQ ) continue; - if (n > 0) putc(',', pileup_fp); - n++; - fprintf(pileup_fp, "%d", p->qpos + 1); - } - if (!n) putc('*', pileup_fp); - } - /* Print selected columns */ - int flag_value = MPLP_PRINT_QNAME; + int flag_value = MPLP_PRINT_MAPQ_CHAR; while(flag_value < MPLP_PRINT_QUAL + 1) { if (conf->flag & flag_value) { n = 0; @@ -834,10 +816,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) ? bam_get_qual(p->b)[p->qpos] : 0; if ( c < conf->min_baseQ ) continue; - if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp); + if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp); n++; switch (flag_value) { + case MPLP_PRINT_MAPQ_CHAR: + c = p->b->core.qual + 33; + if (c > 126) c = 126; + putc(c, pileup_fp); + break; + case MPLP_PRINT_QPOS: + fprintf(pileup_fp, "%d", p->qpos + 1); + break; case MPLP_PRINT_QNAME: fputs(bam_get_qname(p->b), pileup_fp); break; @@ -854,9 +844,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); break; case MPLP_PRINT_MAPQ: - c = p->b->core.qual + 33; - if (c > 126) c = 126; - putc(c, pileup_fp); + fprintf(pileup_fp, "%d", p->b->core.qual); break; case MPLP_PRINT_RNEXT: if (p->b->core.mtid >= 0) @@ -930,6 +918,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } } + if (ret < 0) { + print_error("mpileup", "error reading from input file"); + ret = EXIT_FAILURE; + goto fail; + } + if (conf->all && !(conf->flag & MPLP_BCF)) { // Handle terminating region if (last_tid < 0 && conf->reg && conf->all > 1) { @@ -1110,9 +1104,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " -r, --region REG region in which pileup is generated\n" " -R, --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); +" --rf, --incl-flags 
STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" +" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, " -x, --ignore-overlaps disable read-pair overlap detection\n" @@ -1281,7 +1275,7 @@ int bam_mpileup(int argc, char *argv[]) case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; - case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; + case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break; case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 7c9986f..bcb8a5c 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -2,7 +2,7 @@ /* bam_plcmd.c -- mpileup subcommand. - Copyright (C) 2008-2015, 2019 Genome Research Ltd. + Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -92,8 +92,10 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, int del_len = -p->indel; if (p->indel > 0) { int len = bam_plp_insertion(p, ks, &del_len); - if (len < 0) + if (len < 0) { + print_error("mpileup", "bam_plp_insertion() failed"); return -1; + } putc('+', fp); printw(len, fp); if (bam_is_rev(p->b)) { char pad = rev_del ? 
'#' : '*'; @@ -128,10 +130,11 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_REDO_BAQ (1<<6) #define MPLP_ILLUMINA13 (1<<7) #define MPLP_IGNORE_RG (1<<8) -#define MPLP_PRINT_QPOS (1<<9) -#define MPLP_PER_SAMPLE (1<<11) -#define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_PER_SAMPLE (1<<9) +#define MPLP_SMART_OVERLAPS (1<<10) +#define MPLP_PRINT_MAPQ_CHAR (1<<11) +#define MPLP_PRINT_QPOS (1<<12) #define MPLP_PRINT_QNAME (1<<13) #define MPLP_PRINT_FLAG (1<<14) #define MPLP_PRINT_RNAME (1<<15) @@ -296,9 +299,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { fputs("\t0\t*\t*", fp); - if (conf->flag & MPLP_PRINT_QPOS) - fputs("\t*", fp); - int flag_value = MPLP_PRINT_QNAME; + int flag_value = MPLP_PRINT_MAPQ_CHAR; while(flag_value < MPLP_PRINT_QUAL + 1) { if (conf->flag & flag_value) fputs("\t*", fp); @@ -389,7 +390,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, if (id < 0 || id >= m->n) { assert(q); // otherwise a bug fprintf(samtools_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? 
m->m_plp[id]<<1 : 8; @@ -442,7 +443,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (n == 0) { fprintf(samtools_stderr,"[%s] no input file/data given\n", __func__); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } // read the header of each file in the list and initialize data @@ -453,23 +454,23 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if ( !data[i]->fp ) { fprintf(samtools_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { fprintf(samtools_stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } data[i]->conf = conf; data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { fprintf(samtools_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : sam_hdr_str(h_tmp)); if (conf->flag & MPLP_BCF) { @@ -487,11 +488,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (idx == NULL) { fprintf(samtools_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) { fprintf(samtools_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid; hts_idx_destroy(idx); @@ -529,7 +530,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } // BCF header creation @@ -613,7 +614,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", conf->output_fname? 
conf->output_fname : "standard output"); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } // End of BCF header creation @@ -652,7 +653,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (pileup_fp == NULL) { fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } } @@ -698,7 +699,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", conf->output_fname?conf->output_fname:"standard output"); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) @@ -712,7 +713,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", conf->output_fname?conf->output_fname:"standard output"); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } } } @@ -759,9 +760,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); - if (conf->flag & MPLP_PRINT_QPOS) - fputs("\t*", pileup_fp); - int flag_value = MPLP_PRINT_QNAME; + int flag_value = MPLP_PRINT_MAPQ_CHAR; while(flag_value < MPLP_PRINT_QUAL + 1) { if (conf->flag & flag_value) fputs("\t*", pileup_fp); @@ -807,25 +806,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } if (!n) putc('*', pileup_fp); - /* Print mpileup positions */ - if (conf->flag & MPLP_PRINT_QPOS) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = 
plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? bam_get_qual(p->b)[p->qpos] - : 0; - if ( c < conf->min_baseQ ) continue; - if (n > 0) putc(',', pileup_fp); - n++; - fprintf(pileup_fp, "%d", p->qpos + 1); - } - if (!n) putc('*', pileup_fp); - } - /* Print selected columns */ - int flag_value = MPLP_PRINT_QNAME; + int flag_value = MPLP_PRINT_MAPQ_CHAR; while(flag_value < MPLP_PRINT_QUAL + 1) { if (conf->flag & flag_value) { n = 0; @@ -836,10 +818,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) ? bam_get_qual(p->b)[p->qpos] : 0; if ( c < conf->min_baseQ ) continue; - if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp); + if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp); n++; switch (flag_value) { + case MPLP_PRINT_MAPQ_CHAR: + c = p->b->core.qual + 33; + if (c > 126) c = 126; + putc(c, pileup_fp); + break; + case MPLP_PRINT_QPOS: + fprintf(pileup_fp, "%d", p->qpos + 1); + break; case MPLP_PRINT_QNAME: fputs(bam_get_qname(p->b), pileup_fp); break; @@ -856,9 +846,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); break; case MPLP_PRINT_MAPQ: - c = p->b->core.qual + 33; - if (c > 126) c = 126; - putc(c, pileup_fp); + fprintf(pileup_fp, "%d", p->b->core.qual); break; case MPLP_PRINT_RNEXT: if (p->b->core.mtid >= 0) @@ -932,6 +920,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } } + if (ret < 0) { + print_error("mpileup", "error reading from input file"); + ret = EXIT_FAILURE; + goto fail; + } + if (conf->all && !(conf->flag & MPLP_BCF)) { // Handle terminating region if (last_tid < 0 && conf->reg && conf->all > 1) { @@ -1073,7 +1067,7 @@ int parse_format_flag(const char *str) else { fprintf(samtools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str); - exit(EXIT_FAILURE); + samtools_exit(EXIT_FAILURE); } free(tags[i]); } @@ -1112,9 +1106,9 @@ static void print_usage(FILE *fp, 
const mplp_conf_t *mplp) fprintf(fp, " -r, --region REG region in which pileup is generated\n" " -R, --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); +" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" +" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, " -x, --ignore-overlaps disable read-pair overlap detection\n" @@ -1283,7 +1277,7 @@ int bam_mpileup(int argc, char *argv[]) case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; - case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; + case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break; case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index 8149514..a48d7f6 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -444,7 +444,7 @@ static void usage(FILE *fp, int ret) { " -i, --in-place Modify the CRAM file directly, if possible.\n" " (Defaults to outputting to samtools_stdout.)\n" " -c, --command CMD Pass the header in SAM format to external program CMD.\n"); - exit(ret); + samtools_exit(ret); } static sam_hdr_t* external_reheader(samFile* in, const char* external) { @@ -533,7 +533,7 @@ cleanup: return h; } -int main_reheader(int argc, char *argv[]) +int samtools_main_reheader(int argc, char *argv[]) { int inplace = 0, r, no_pg = 0, c, skip_header = 0; sam_hdr_t *h; diff --git a/samtools/bam_rmdupse.c.pysam.c b/samtools/bam_rmdupse.c.pysam.c index 2c67fac..65689d7 100644 --- a/samtools/bam_rmdupse.c.pysam.c +++ 
b/samtools/bam_rmdupse.c.pysam.c @@ -86,8 +86,8 @@ static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, in p->discarded = 0; p->endpos = endpos; p->score = score; if (p->b == 0) p->b = bam_init1(); - if (!p->b) { perror(NULL); exit(EXIT_FAILURE); } - if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); } + if (!p->b) { perror(NULL); samtools_exit(EXIT_FAILURE); } + if (bam_copy1(p->b, b) == NULL) { perror(NULL); samtools_exit(EXIT_FAILURE); } return p; } @@ -183,7 +183,7 @@ int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se) } else { // replace p->score = score; p->endpos = endpos; if (bam_copy1(p->b, b) == NULL) { - perror(NULL); exit(EXIT_FAILURE); + perror(NULL); samtools_exit(EXIT_FAILURE); } } } // otherwise, discard the alignment diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 0bf346c..46a1d80 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -1,6 +1,6 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2019 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -33,11 +33,13 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include #include +#include #include "htslib/ksort.h" #include "htslib/hts_os.h" #include "htslib/khash.h" @@ -47,6 +49,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts_endian.h" #include "sam_opts.h" #include "samtools.h" +#include "bedidx.h" // Struct which contains the a record, and the pointer to the sort tag (if any) or @@ -97,6 +100,7 @@ KLIST_INIT(hdrln, char*, hdrln_free_char) static int g_is_by_qname = 0; static int g_is_by_tag = 0; +static int g_is_by_minhash = 0; static char g_sort_tag[2] = {0,0}; static int strnum_cmp(const char *_a, const char *_b) @@ -133,8 +137,11 @@ typedef struct { } heap1_t; static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b); +static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b); // Function to compare reads in the heap and determine which one is < the other +// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this +// is strictly 0 or 1 only. static inline int heap_lt(const heap1_t a, const heap1_t b) { if (!a.entry.bam_record) @@ -146,6 +153,9 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) int t; t = bam1_cmp_by_tag(a.entry, b.entry); if (t != 0) return t > 0; + } else if (g_is_by_minhash) { + int t = bam1_cmp_by_minhash(a.entry, b.entry); + if (t != 0) return t > 0; } else if (g_is_by_qname) { int t, fa, fb; t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); @@ -513,7 +523,8 @@ static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, id_len = id_end - idp; if (id_len < transformed_id.l) { - if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len)) + if (ks_resize(&new_hdr_line, new_hdr_line.l + + transformed_id.l - id_len + 1/*nul*/)) goto fail; } if (id_len != transformed_id.l) { @@ -714,6 +725,7 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, // Get translated header lines and fill in map for @PG records pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids, tbl->pg_trans, NULL); + if (!pg_list) goto fail; // Fix-up PG: tags in the new @RG records and add to output if (finish_rg_pg(true, 
rg_list, tbl->pg_trans, &merged_hdr->out_rg)) @@ -911,10 +923,38 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) #define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them #define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only) + +static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { + if (!rl) + return NULL; + + hts_reglist_t *new_rl = calloc(rn, sizeof(hts_reglist_t)); + if (!new_rl) + return NULL; + + int i; + for (i=0; i < rn; i++) { + new_rl[i].tid = rl[i].tid; + new_rl[i].count = rl[i].count; + new_rl[i].min_beg = rl[i].min_beg; + new_rl[i].max_end = rl[i].max_end; + + new_rl[i].reg = rl[i].reg; + new_rl[i].intervals = malloc(new_rl[i].count * sizeof(hts_pair_pos_t)); + if (!new_rl[i].intervals) { + hts_reglist_free(new_rl, i); + return NULL; + } + memcpy(new_rl[i].intervals, rl[i].intervals, new_rl[i].count * sizeof(hts_pair_pos_t)); + } + + return new_rl; +} + /* * How merging is handled * - * If a hheader is defined use we will use that as our output header + * If a header is defined use we will use that as our output header * otherwise we use the first header from the first input file. 
* * Now go through each file and create a translation table for that file for: @@ -957,9 +997,9 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) */ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, const char *headers, int n, char * const *fn, char * const *fn_idx, - int flag, const char *reg, int n_threads, const char *cmd, - const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index, - char *arg_list, int no_pg) + const char *fn_bed, int flag, const char *reg, int n_threads, + const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, + int write_index, char *arg_list, int no_pg) { samFile *fpout, **fp = NULL; heap1_t *heap = NULL; @@ -973,6 +1013,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m trans_tbl_t *translation_tbl = NULL; int *rtrans = NULL; char *out_idx_fn = NULL; + void *hreg = NULL; + hts_reglist_t *lreg = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; @@ -1030,7 +1072,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m } if (hin) { - // Popluate merged_hdr from the pre-prepared header + // Populate merged_hdr from the pre-prepared header trans_tbl_t dummy; int res; res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG, @@ -1059,10 +1101,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m RG[i])) return -1; // FIXME: memory leak - // TODO sam_itr_next() doesn't yet work for SAM files, - // so for those keep the headers around for use with sam_read1() - if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; - else { sam_hdr_destroy(hin); hdr[i] = NULL; } + hdr[i] = hin; if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); @@ -1098,10 +1137,22 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char 
*out, const char *m if (!hout) return -1; // FIXME: memory leak // If we're only merging a specified region move our iters to start at that point - if (reg) { - int tid; - hts_pos_t beg, end; + int tid, nreg; + hts_pos_t beg, end; + if (fn_bed) { + hreg = bed_read(fn_bed); + if (!hreg) { + fprintf(stderr, "[%s] Could not read BED file: \"%s\"\n", __func__, fn_bed); + goto fail; + } + bed_unify(hreg); + lreg = bed_reglist(hreg, ALL, &nreg); + if (!lreg || !nreg) { + fprintf(stderr, "[%s] Null or empty region list\n", __func__); + goto fail; + } + } else if (reg) { rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl); if (!rtrans) goto mem_fail; @@ -1109,55 +1160,69 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m fprintf(stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg); goto fail; } + + } + + if (reg || fn_bed) { + hts_idx_t *reg_idx = NULL; for (i = 0; i < n; ++i) { - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder + + // If index filename has not been specified, look in the BAM folder if (fn_idx != NULL) { - idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); + reg_idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); } else { - idx = sam_index_load(fp[i], fn[i]); + reg_idx = sam_index_load(fp[i], fn[i]); } - // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space - int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; - if (idx == NULL) { - fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", + if (reg_idx == NULL) { + fprintf(stderr, "[%s] failed to load index for %s. 
Random alignment retrieval only works for indexed BAM or CRAM files.\n", __func__, fn[i]); + free(rtrans); + rtrans = NULL; goto fail; } - if (mapped_tid != INT32_MIN) { - iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); + + int mapped_tid = INT32_MIN; + if (fn_bed) { + hts_reglist_t *rl = duplicate_reglist(lreg, nreg); + iter[i] = sam_itr_regions(reg_idx, hdr[i], rl, nreg); } else { - iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); + // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space + mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; + if (mapped_tid != INT32_MIN) { + iter[i] = sam_itr_queryi(reg_idx, mapped_tid, beg, end); + } else { + iter[i] = sam_itr_queryi(reg_idx, HTS_IDX_NONE, 0, 0); + } } - hts_idx_destroy(idx); + if (iter[i] == NULL) { - if (mapped_tid != INT32_MIN) { - fprintf(stderr, - "[%s] failed to get iterator over " - "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", - __func__, fn[i], mapped_tid, beg, end); + if (fn_bed) { + fprintf(stderr, "[%s] failed to get multi-region iterator " + "{%s, %s}\n", __func__, fn[i], fn_bed); } else { - fprintf(stderr, - "[%s] failed to get iterator over " - "{%s, HTS_IDX_NONE, 0, 0}\n", - __func__, fn[i]); + if (mapped_tid != INT32_MIN) { + fprintf(stderr, + "[%s] failed to get iterator over " + "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", + __func__, fn[i], mapped_tid, beg, end); + } else { + fprintf(stderr, + "[%s] failed to get iterator over " + "{%s, HTS_IDX_NONE, 0, 0}\n", + __func__, fn[i]); + } } + hts_idx_destroy(reg_idx); + free(rtrans); + rtrans = NULL; goto fail; } + + hts_idx_destroy(reg_idx); } + free(rtrans); rtrans = NULL; - } else { - for (i = 0; i < n; ++i) { - if (hdr[i] == NULL) { - iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); - if (iter[i] == NULL) { - fprintf(stderr, "[%s] failed to get iterator\n", __func__); - goto fail; - } - } - else iter[i] = NULL; - } } // Load the first read from each file into the heap @@ -1279,6 +1344,8 @@ int bam_merge_core2(int 
by_qname, char* sort_tag, const char *out, const char *m sam_hdr_destroy(hin); sam_hdr_destroy(hout); free_merged_header(merged_hdr); + hts_reglist_free(lreg, nreg); + bed_destroy(hreg); free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); if (sam_close(fpout) < 0) { print_error(cmd, "error closing output file"); @@ -1307,6 +1374,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m free(RG); free(translation_tbl); free(hdr); + hts_reglist_free(lreg, nreg); + bed_destroy(hreg); free(iter); free(heap); free(fp); @@ -1322,13 +1391,14 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch strcpy(mode, "wb"); if (flag & MERGE_UNCOMP) strcat(mode, "0"); else if (flag & MERGE_LEVEL1) strcat(mode, "1"); - return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); + return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); } static void merge_usage(FILE *to) { fprintf(to, -"Usage: samtools merge [-nurlf] [-h inh.sam] [-b ] [ ... ]\n" +"Usage: samtools merge [options] -o [options] ... \n" +" or: samtools merge [options] ... 
\n" "\n" "Options:\n" " -n Input files are sorted by read name\n" @@ -1336,6 +1406,7 @@ static void merge_usage(FILE *to) " -r Attach RG tag (inferred from file names)\n" " -u Uncompressed BAM output\n" " -f Overwrite the output BAM if exist\n" +" -o FILE Specify output file via option instead of argument\n" " -1 Compress level 1\n" " -l INT Compression level, from 0 to 9 [-1]\n" " -R STR Merge file in the specified region STR [all]\n" @@ -1345,6 +1416,7 @@ static void merge_usage(FILE *to) " -s VALUE Override random seed\n" " -b FILE List of input BAM filenames, one per line [null]\n" " -X Use customized index files\n" +" -L FILE Specify a BED file for multiple region filtering [null]\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(to, "-.O..@.."); } @@ -1353,10 +1425,10 @@ int bam_merge(int argc, char *argv[]) { int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; char *fn_headers = NULL, *reg = NULL, mode[12]; - char *sort_tag = NULL, *arg_list = NULL; + char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL; long random_seed = (long)time(NULL); char** fn = NULL; - char** fn_idx = NULL; + char** fn_idx = NULL, *fn_bed = NULL; int fn_size = 0, no_pg = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -1372,12 +1444,13 @@ int bam_merge(int argc, char *argv[]) return 0; } - while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = optarg; break; case 'n': is_by_qname = 1; break; + case 'o': fnout = optarg; break; case 't': sort_tag = optarg; break; case '1': flag |= MERGE_LEVEL1; level = 1; break; case 'u': flag |= MERGE_UNCOMP; level = 0; break; @@ -1387,6 +1460,7 @@ int bam_merge(int argc, char *argv[]) case 'p': flag |= MERGE_COMBINE_PG; break; case 's': random_seed = atol(optarg); break; case 'X': 
has_index_file = 1; break; // -X flag for index filename + case 'L': fn_bed = optarg; break; case 'b': { // load the list of files to read if (has_index_file) { @@ -1415,7 +1489,12 @@ int bam_merge(int argc, char *argv[]) case '?': merge_usage(stderr); return 1; } } - if ( argc - optind < 1 ) { + + if (fnout == NULL && argc - optind >= 1) { + fnout = argv[optind]; + optind++; + } + if (fnout == NULL) { print_error("merge", "You must at least specify the output file"); merge_usage(stderr); return 1; @@ -1426,50 +1505,57 @@ int bam_merge(int argc, char *argv[]) return 1; } - srand48(random_seed); - if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { - FILE *fp = fopen(argv[optind], "rb"); - if (fp != NULL) { - fclose(fp); - fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); - return 1; + hts_srand48(random_seed); + if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) { + struct stat sbuf; + if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) { + fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout); + ret = 1; + goto end; } } int nargcfiles = 0; if (has_index_file) { // Calculate # of input BAM files - if ((argc - optind - 1) % 2 != 0) { + if ((argc - optind) % 2 != 0) { fprintf(stderr, "Odd number of filenames detected! 
Each BAM file should have an index file\n"); - return 1; + ret = 1; + goto end; } - nargcfiles = (argc - optind - 1) / 2; + nargcfiles = (argc - optind) / 2; } else { - nargcfiles = argc - optind - 1; + nargcfiles = argc - optind; } if (nargcfiles > 0) { // Add argc files to end of array fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); if (fn == NULL) { ret = 1; goto end; } - memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); + memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*)); if(has_index_file) { fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); if (fn_idx == NULL) { ret = 1; goto end; } - memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); + memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*)); } } if (fn_size+nargcfiles < 1) { print_error("merge", "You must specify at least one (and usually two or more) input files"); merge_usage(stderr); - free(fn_idx); - return 1; + ret = 1; + goto end; + } + + if (reg && fn_bed) { + print_error("merge", "You must specify either a BED file or a region"); + ret = 1; + goto end; } strcpy(mode, "wb"); - sam_open_mode(mode+1, argv[optind], NULL); + sam_open_mode(mode+1, fnout, NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? 
level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, - fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads, + if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers, + fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads, "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ret = 1; @@ -1631,6 +1717,12 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, ks_heapmake(heap, heap_size, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->entry.bam_record; + if (g_is_by_minhash && b->core.tid == -1) { + // Remove the cached minhash value + b->core.pos = -1; + b->core.mpos = -1; + b->core.isize = 0; + } if (sam_write1(fpout, hout, b) < 0) { print_error_errno(cmd, "failed writing to \"%s\"", out); goto fail; @@ -1789,12 +1881,45 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b) } } +// Sort by minimiser (stored in bam1_tag.u.pos). +// If equal, sort by position. +// +// The 64-bit sort key is split over the bam pos and isize fields. +// This permits it to survive writing to temporary file and coming back. 
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) +{ + const bam1_t *A = a.bam_record; + const bam1_t *B = b.bam_record; + + if (!A) return 1; + if (!B) return 0; + + if (A->core.tid != -1 || B->core.tid != -1) + return bam1_cmp_core(a,b); + + const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos; + const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos; + + if (m_a < m_b) // by hash + return -1; + else if (m_a > m_b) + return 1; + else if (A->core.isize < B->core.isize) // by hash location in seq + return -1; + else if (A->core.isize > B->core.isize) + return 1; + else + return bam1_cmp_core(a,b); +} + // Function to compare reads and determine which one is < the other // Handle sort-by-pos, sort-by-name, or sort-by-tag static inline int bam1_lt(const bam1_tag a, const bam1_tag b) { if (g_is_by_tag) { return bam1_cmp_by_tag(a, b) < 0; + } else if (g_is_by_minhash) { + return bam1_cmp_by_minhash(a, b) < 0; } else { return bam1_cmp_core(a,b) < 0; } @@ -1818,7 +1943,7 @@ typedef struct { // -1 for failure static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const sam_hdr_t *h, int n_threads, const htsFormat *fmt, - char *arg_list, int no_pg, int write_index) + int clear_minhash, char *arg_list, int no_pg, int write_index) { size_t i; samFile* fp; @@ -1826,22 +1951,27 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu fp = sam_open_format(fn, mode, fmt); if (fp == NULL) return -1; - if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", - "VN", samtools_version(), + if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? 
arg_list : NULL, NULL)) { goto fail; } - if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail; + if (sam_hdr_write(fp, h) != 0) goto fail; - if (write_index) { + if (write_index) if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail; - } if (n_threads > 1) hts_set_threads(fp, n_threads); for (i = 0; i < l; ++i) { - if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail; + bam1_t *b = buf[i].bam_record; + if (clear_minhash && b->core.tid == -1) { + // Remove the cached minhash value + b->core.pos = -1; + b->core.mpos = -1; + b->core.isize = 0; + } + if (sam_write1(fp, h, b) < 0) goto fail; } if (write_index) { @@ -1944,18 +2074,206 @@ err: return ret; } +/* + * Computes the minhash of a sequence using both forward and reverse strands. + * + * This is used as a sort key for unmapped data, to collate like sequences + * together and to improve compression ratio. + * + * The minhash is returned and *pos filled out with location of this hash + * key in the sequence if pos != NULL. 
+ */ +static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) { + uint64_t hashf = 0, minhashf = UINT64_MAX; + uint64_t hashr = 0, minhashr = UINT64_MAX; + int minhashpf = 0, minhashpr = 0, i; + uint64_t mask = (1L<<(2*kmer))-1; + unsigned char *seq = bam_get_seq(b); + int len = b->core.l_qseq; + + // Lookup tables for bam_seqi to 0123 fwd/rev hashes + // =ACM GRSV TWYH KDBN +#define X 0 + unsigned char L[16] = { + X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X, + }; + uint64_t R[16] = { + X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X, + }; + for (i = 0; i < 16; i++) + R[i] <<= 2*(kmer-1); + + // Punt homopolymers somewhere central in the hash space +#define XOR (0xdead7878beef7878 & mask) + + // Initialise hash keys + for (i = 0; i < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + hashf = (hashf<<2) | L[base]; + hashr = (hashr>>2) | R[base]; + } + + // Loop to find minimum + for (; i < len; i++) { + int base = bam_seqi(seq, i); + + hashf = ((hashf<<2) | L[base]) & mask; + hashr = (hashr>>2) | R[base]; + + if (minhashf > (hashf^XOR)) + minhashf = (hashf^XOR), minhashpf = i; + if (minhashr > (hashr^XOR)) + minhashr = (hashr^XOR), minhashpr = len-i+kmer-2; + + } + + if (minhashf <= minhashr) { + if (rev) *rev = 0; + if (pos) *pos = minhashpf; + return minhashf; + } else { + if (rev) *rev = 1; + if (pos) *pos = minhashpr; + return minhashr; + } +} + +//--- Start of candidates to punt to htslib +/*! + * @abstract + * Extracts the sequence (in current alignment orientation) from + * a bam record and places it in buf, which is nul terminated. + * + * @param b The bam structure + * @param buf A buffer at least b->core.l_qseq+1 bytes long + */ +static void bam_to_seq(bam1_t *b, char *buf) { + int i; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + buf[i] = seq_nt16_str[bam_seqi(seq, i)]; + buf[i] = 0; +} + +/*! + * @abstract + * Writes a new sequence, of length b->core.l_qseq, to a BAM record. 
+ * + * If a sequence of a new length is required the caller must first make + * room for it by updating the bam1_t struct. + * + * @param b The bam structure + * @param buf A buffer at least b->core.l_qseq bytes long + */ +static void seq_to_bam(bam1_t *b, char *buf) { + int i; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + bam_set_seqi(seq, i, seq_nt16_table[(unsigned char)buf[i]]); +} + +/*! + * @abstract Reverse complements a BAM record. + * + * It's possible to do this inline, but complex due to the 4-bit sequence + * encoding. For now I take the dumb approach. + * + * @param b Pointer to a BAM alignment + * + * @return 0 on success, -1 on failure (ENOMEM) + */ +static int reverse_complement(bam1_t *b) { + static char comp[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//00 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//10 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//20 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//30 + + // * * * * E F * * I J * L * * O + '@','T','V','G', 'H','E','F','C', 'D','I','H','M', 'L','K','N','O',//40 + //P Q * * * * * * X Y Z [ \ ] ^ _ + 'P','Q','Y','S', 'A','A','B','W', 'X','Y','Z','[','\\','[','^','_',//50 + //` * * * * E F * * I J * L * * O + '`','t','v','g', 'h','e','f','c', 'd','i','j','m', 'l','k','n','o',//60 + //P Q * * * * * * X Y Z { | } ~ DEL + 'p','q','y','s', 'a','a','b','w', 'x','y','z','{', '|','}','~',127,//70 + + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//80 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//90 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//A0 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//B0 + + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//C0 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//D0 + 'N','N','N','N', 'N','N','N','N', 
'N','N','N','N', 'N','N','N','N',//E0 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//F0 + }; + char seq_[10000], *seq = seq_; + uint8_t *qual = bam_get_qual(b); + int i, j; + + if (b->core.l_qseq >= 10000) + if (!(seq = malloc(b->core.l_qseq+1))) + return -1; + + bam_to_seq(b, seq); + + for (i = 0, j = b->core.l_qseq-1; i < j; i++, j--) { + unsigned char tmp = seq[i]; + seq[i] = comp[(unsigned char)seq[j]]; + seq[j] = comp[tmp]; + tmp = qual[i]; + qual[i] = qual[j]; + qual[j] = tmp; + } + if (i ==j) + seq[i] = comp[(unsigned char)seq[i]]; + + seq_to_bam(b, seq); + + if (seq != seq_) + free(seq); + + b->core.flag ^= 0x10; + + return 0; +} +//--- End of candidates to punt to htslib + static void *worker(void *data) { worker_t *w = (worker_t*)data; char *name; w->error = 0; - if (!g_is_by_qname && !g_is_by_tag) { + if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) { if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { w->error = errno; return NULL; } } else { + if (g_is_by_minhash) { + int i; + for (i = 0; i < w->buf_len; i++) { + bam1_t *b = w->buf[i].bam_record; + if (b->core.tid != -1) + continue; + + int pos = 0, rev = 0; + uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev); + if (rev) + reverse_complement(b); + + // Store 64-bit hash in unmapped pos and mpos fields. + // The position of hash is in isize, which we use for + // resolving ties when sorting by hash key. + // These are unused for completely unmapped data and + // will be reset during final output. + b->core.pos = mh>>31; + b->core.mpos = mh&0x7fffffff; + b->core.isize = 65535-pos >=0 ? 
65535-pos : 0; + } + } ks_mergesort(sort, w->buf_len, w->buf, 0); } @@ -1983,10 +2301,10 @@ static void *worker(void *data) return 0; } - if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0) + if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0) w->error = errno; } else { - if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0) + if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0) w->error = errno; } @@ -2043,6 +2361,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, return n_files + n_threads; } + /*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @@ -2067,7 +2386,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, */ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, const char *fnout, const char *modeout, - size_t _max_mem, int n_threads, + size_t _max_mem, int by_minimiser, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -2090,6 +2409,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; + g_is_by_minhash = by_minimiser; if (sort_by_tag) { g_is_by_tag = 1; g_sort_tag[0] = sort_by_tag[0]; @@ -2116,11 +2436,23 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const else new_so = "coordinate"; - if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) - ) { - print_error("sort", "failed to change sort order header to '%s'\n", new_so); - goto err; + if (by_minimiser) { + const char *new_ss = "coordinate:minhash"; + if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss)) + && (-1 == 
sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, + "SO", new_so, "SS", new_ss, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n", + new_so, new_ss); + goto err; + } + } else { + if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); + goto err; + } } if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { @@ -2207,7 +2539,8 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const // write the final output if (n_files == 0 && num_in_mem < 2) { // a single block - if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) { + if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, + g_is_by_minhash, arg_list, no_pg, write_index) != 0) { print_error_errno("sort", "failed to create \"%s\"", fnout); goto err; } @@ -2261,7 +2594,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma char *fnout = calloc(strlen(prefix) + 4 + 1, 1); if (!fnout) return -1; sprintf(fnout, "%s.bam", prefix); - ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); + ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0); free(fnout); return ret; } @@ -2272,13 +2605,16 @@ static void sort_usage(FILE *fp) "Usage: samtools sort [options...] 
[in.bam]\n" "Options:\n" " -l INT Set compression level, from 0 (uncompressed) to 9 (best)\n" +" -u Output uncompressed data (equivalent to -l 0)\n" " -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n" -" -n Sort by read name\n" +" -M Use minimiser for clustering unaligned/unplaced reads\n" +" -K INT Kmer size to use for minimiser [20]\n" +" -n Sort by read name (not compatible with samtools index command)\n" " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" " -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n"); - sam_global_opt_help(fp, "-.O..@-."); + sam_global_opt_help(fp, "-.O..@.."); } static void complain_about_memory_setting(size_t max_mem) { @@ -2302,6 +2638,7 @@ int bam_sort(int argc, char *argv[]) { size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; + int by_minimiser = 0, minimiser_kmer = 20; char* sort_tag = NULL, *arg_list = NULL; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; @@ -2315,7 +2652,7 @@ int bam_sort(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; case 'n': is_by_qname = 1; break; @@ -2330,7 +2667,16 @@ int bam_sort(int argc, char *argv[]) } case 'T': kputs(optarg, &tmpprefix); break; case 'l': level = atoi(optarg); break; - case 1: no_pg = 1; break; + case 'u': level = 0; break; + case 1: no_pg = 1; break; + case 'M': by_minimiser = 1; break; + case 'K': + minimiser_kmer = atoi(optarg); + if (minimiser_kmer < 1) + minimiser_kmer = 1; + else if (minimiser_kmer > 31) + minimiser_kmer = 31; + break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) 
break; /* else fall-through */ @@ -2385,7 +2731,8 @@ int bam_sort(int argc, char *argv[]) } ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, + tmpprefix.s, fnout, modeout, max_mem, + by_minimiser * minimiser_kmer, ga.nthreads, &ga.in, &ga.out, arg_list, no_pg, ga.write_index); if (ret >= 0) ret = EXIT_SUCCESS; diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 3093960..6cbf66a 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -2,7 +2,7 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2019 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -35,11 +35,13 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include #include #include +#include #include "htslib/ksort.h" #include "htslib/hts_os.h" #include "htslib/khash.h" @@ -49,6 +51,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts_endian.h" #include "sam_opts.h" #include "samtools.h" +#include "bedidx.h" // Struct which contains the a record, and the pointer to the sort tag (if any) or @@ -99,6 +102,7 @@ KLIST_INIT(hdrln, char*, hdrln_free_char) static int g_is_by_qname = 0; static int g_is_by_tag = 0; +static int g_is_by_minhash = 0; static char g_sort_tag[2] = {0,0}; static int strnum_cmp(const char *_a, const char *_b) @@ -135,8 +139,11 @@ typedef struct { } heap1_t; static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b); +static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b); // Function to compare reads in the heap and determine which one is < the other +// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this +// is strictly 0 or 1 only. 
static inline int heap_lt(const heap1_t a, const heap1_t b) { if (!a.entry.bam_record) @@ -148,6 +155,9 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) int t; t = bam1_cmp_by_tag(a.entry, b.entry); if (t != 0) return t > 0; + } else if (g_is_by_minhash) { + int t = bam1_cmp_by_minhash(a.entry, b.entry); + if (t != 0) return t > 0; } else if (g_is_by_qname) { int t, fa, fb; t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); @@ -515,7 +525,8 @@ static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, id_len = id_end - idp; if (id_len < transformed_id.l) { - if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len)) + if (ks_resize(&new_hdr_line, new_hdr_line.l + + transformed_id.l - id_len + 1/*nul*/)) goto fail; } if (id_len != transformed_id.l) { @@ -716,6 +727,7 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, // Get translated header lines and fill in map for @PG records pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids, tbl->pg_trans, NULL); + if (!pg_list) goto fail; // Fix-up PG: tags in the new @RG records and add to output if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg)) @@ -913,10 +925,38 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) #define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them #define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only) + +static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { + if (!rl) + return NULL; + + hts_reglist_t *new_rl = calloc(rn, sizeof(hts_reglist_t)); + if (!new_rl) + return NULL; + + int i; + for (i=0; i < rn; i++) { + new_rl[i].tid = rl[i].tid; + new_rl[i].count = rl[i].count; + new_rl[i].min_beg = rl[i].min_beg; + new_rl[i].max_end = rl[i].max_end; + + new_rl[i].reg = rl[i].reg; + new_rl[i].intervals = malloc(new_rl[i].count * sizeof(hts_pair_pos_t)); + if (!new_rl[i].intervals) { + 
hts_reglist_free(new_rl, i); + return NULL; + } + memcpy(new_rl[i].intervals, rl[i].intervals, new_rl[i].count * sizeof(hts_pair_pos_t)); + } + + return new_rl; +} + /* * How merging is handled * - * If a hheader is defined use we will use that as our output header + * If a header is defined use we will use that as our output header * otherwise we use the first header from the first input file. * * Now go through each file and create a translation table for that file for: @@ -959,9 +999,9 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl) */ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, const char *headers, int n, char * const *fn, char * const *fn_idx, - int flag, const char *reg, int n_threads, const char *cmd, - const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index, - char *arg_list, int no_pg) + const char *fn_bed, int flag, const char *reg, int n_threads, + const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, + int write_index, char *arg_list, int no_pg) { samFile *fpout, **fp = NULL; heap1_t *heap = NULL; @@ -975,6 +1015,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m trans_tbl_t *translation_tbl = NULL; int *rtrans = NULL; char *out_idx_fn = NULL; + void *hreg = NULL; + hts_reglist_t *lreg = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; @@ -1032,7 +1074,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m } if (hin) { - // Popluate merged_hdr from the pre-prepared header + // Populate merged_hdr from the pre-prepared header trans_tbl_t dummy; int res; res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG, @@ -1061,10 +1103,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m RG[i])) return -1; // FIXME: memory leak - // TODO sam_itr_next() doesn't yet work for SAM files, - // so for those keep the headers around 
for use with sam_read1() - if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; - else { sam_hdr_destroy(hin); hdr[i] = NULL; } + hdr[i] = hin; if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); @@ -1100,10 +1139,22 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m if (!hout) return -1; // FIXME: memory leak // If we're only merging a specified region move our iters to start at that point - if (reg) { - int tid; - hts_pos_t beg, end; + int tid, nreg; + hts_pos_t beg, end; + if (fn_bed) { + hreg = bed_read(fn_bed); + if (!hreg) { + fprintf(samtools_stderr, "[%s] Could not read BED file: \"%s\"\n", __func__, fn_bed); + goto fail; + } + bed_unify(hreg); + lreg = bed_reglist(hreg, ALL, &nreg); + if (!lreg || !nreg) { + fprintf(samtools_stderr, "[%s] Null or empty region list\n", __func__); + goto fail; + } + } else if (reg) { rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl); if (!rtrans) goto mem_fail; @@ -1111,55 +1162,69 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m fprintf(samtools_stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg); goto fail; } + + } + + if (reg || fn_bed) { + hts_idx_t *reg_idx = NULL; for (i = 0; i < n; ++i) { - hts_idx_t *idx = NULL; - // If index filename has not been specfied, look in BAM folder + + // If index filename has not been specified, look in the BAM folder if (fn_idx != NULL) { - idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); + reg_idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); } else { - idx = sam_index_load(fp[i], fn[i]); + reg_idx = sam_index_load(fp[i], fn[i]); } - // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space - int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; - if (idx == NULL) { - fprintf(samtools_stderr, "[%s] failed to load index for %s. 
Random alignment retrieval only works for indexed BAM or CRAM files.\n", + if (reg_idx == NULL) { + fprintf(samtools_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", __func__, fn[i]); + free(rtrans); + rtrans = NULL; goto fail; } - if (mapped_tid != INT32_MIN) { - iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); + + int mapped_tid = INT32_MIN; + if (fn_bed) { + hts_reglist_t *rl = duplicate_reglist(lreg, nreg); + iter[i] = sam_itr_regions(reg_idx, hdr[i], rl, nreg); } else { - iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); + // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space + mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; + if (mapped_tid != INT32_MIN) { + iter[i] = sam_itr_queryi(reg_idx, mapped_tid, beg, end); + } else { + iter[i] = sam_itr_queryi(reg_idx, HTS_IDX_NONE, 0, 0); + } } - hts_idx_destroy(idx); + if (iter[i] == NULL) { - if (mapped_tid != INT32_MIN) { - fprintf(samtools_stderr, - "[%s] failed to get iterator over " - "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", - __func__, fn[i], mapped_tid, beg, end); + if (fn_bed) { + fprintf(samtools_stderr, "[%s] failed to get multi-region iterator " + "{%s, %s}\n", __func__, fn[i], fn_bed); } else { - fprintf(samtools_stderr, - "[%s] failed to get iterator over " - "{%s, HTS_IDX_NONE, 0, 0}\n", - __func__, fn[i]); + if (mapped_tid != INT32_MIN) { + fprintf(samtools_stderr, + "[%s] failed to get iterator over " + "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", + __func__, fn[i], mapped_tid, beg, end); + } else { + fprintf(samtools_stderr, + "[%s] failed to get iterator over " + "{%s, HTS_IDX_NONE, 0, 0}\n", + __func__, fn[i]); + } } + hts_idx_destroy(reg_idx); + free(rtrans); + rtrans = NULL; goto fail; } + + hts_idx_destroy(reg_idx); } + free(rtrans); rtrans = NULL; - } else { - for (i = 0; i < n; ++i) { - if (hdr[i] == NULL) { - iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); - if (iter[i] == NULL) { - 
fprintf(samtools_stderr, "[%s] failed to get iterator\n", __func__); - goto fail; - } - } - else iter[i] = NULL; - } } // Load the first read from each file into the heap @@ -1281,6 +1346,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m sam_hdr_destroy(hin); sam_hdr_destroy(hout); free_merged_header(merged_hdr); + hts_reglist_free(lreg, nreg); + bed_destroy(hreg); free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); if (sam_close(fpout) < 0) { print_error(cmd, "error closing output file"); @@ -1309,6 +1376,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m free(RG); free(translation_tbl); free(hdr); + hts_reglist_free(lreg, nreg); + bed_destroy(hreg); free(iter); free(heap); free(fp); @@ -1324,13 +1393,14 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch strcpy(mode, "wb"); if (flag & MERGE_UNCOMP) strcat(mode, "0"); else if (flag & MERGE_LEVEL1) strcat(mode, "1"); - return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); + return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); } static void merge_usage(FILE *to) { fprintf(to, -"Usage: samtools merge [-nurlf] [-h inh.sam] [-b ] [ ... ]\n" +"Usage: samtools merge [options] -o [options] ... \n" +" or: samtools merge [options] ... 
\n" "\n" "Options:\n" " -n Input files are sorted by read name\n" @@ -1338,6 +1408,7 @@ static void merge_usage(FILE *to) " -r Attach RG tag (inferred from file names)\n" " -u Uncompressed BAM output\n" " -f Overwrite the output BAM if exist\n" +" -o FILE Specify output file via option instead of argument\n" " -1 Compress level 1\n" " -l INT Compression level, from 0 to 9 [-1]\n" " -R STR Merge file in the specified region STR [all]\n" @@ -1347,6 +1418,7 @@ static void merge_usage(FILE *to) " -s VALUE Override random seed\n" " -b FILE List of input BAM filenames, one per line [null]\n" " -X Use customized index files\n" +" -L FILE Specify a BED file for multiple region filtering [null]\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(to, "-.O..@.."); } @@ -1355,10 +1427,10 @@ int bam_merge(int argc, char *argv[]) { int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; char *fn_headers = NULL, *reg = NULL, mode[12]; - char *sort_tag = NULL, *arg_list = NULL; + char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL; long random_seed = (long)time(NULL); char** fn = NULL; - char** fn_idx = NULL; + char** fn_idx = NULL, *fn_bed = NULL; int fn_size = 0, no_pg = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -1374,12 +1446,13 @@ int bam_merge(int argc, char *argv[]) return 0; } - while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = optarg; break; case 'n': is_by_qname = 1; break; + case 'o': fnout = optarg; break; case 't': sort_tag = optarg; break; case '1': flag |= MERGE_LEVEL1; level = 1; break; case 'u': flag |= MERGE_UNCOMP; level = 0; break; @@ -1389,6 +1462,7 @@ int bam_merge(int argc, char *argv[]) case 'p': flag |= MERGE_COMBINE_PG; break; case 's': random_seed = atol(optarg); break; case 'X': 
has_index_file = 1; break; // -X flag for index filename + case 'L': fn_bed = optarg; break; case 'b': { // load the list of files to read if (has_index_file) { @@ -1417,7 +1491,12 @@ int bam_merge(int argc, char *argv[]) case '?': merge_usage(samtools_stderr); return 1; } } - if ( argc - optind < 1 ) { + + if (fnout == NULL && argc - optind >= 1) { + fnout = argv[optind]; + optind++; + } + if (fnout == NULL) { print_error("merge", "You must at least specify the output file"); merge_usage(samtools_stderr); return 1; @@ -1428,50 +1507,57 @@ int bam_merge(int argc, char *argv[]) return 1; } - srand48(random_seed); - if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { - FILE *fp = fopen(argv[optind], "rb"); - if (fp != NULL) { - fclose(fp); - fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); - return 1; + hts_srand48(random_seed); + if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) { + struct stat sbuf; + if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) { + fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout); + ret = 1; + goto end; } } int nargcfiles = 0; if (has_index_file) { // Calculate # of input BAM files - if ((argc - optind - 1) % 2 != 0) { + if ((argc - optind) % 2 != 0) { fprintf(samtools_stderr, "Odd number of filenames detected! 
Each BAM file should have an index file\n"); - return 1; + ret = 1; + goto end; } - nargcfiles = (argc - optind - 1) / 2; + nargcfiles = (argc - optind) / 2; } else { - nargcfiles = argc - optind - 1; + nargcfiles = argc - optind; } if (nargcfiles > 0) { // Add argc files to end of array fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); if (fn == NULL) { ret = 1; goto end; } - memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); + memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*)); if(has_index_file) { fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); if (fn_idx == NULL) { ret = 1; goto end; } - memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); + memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*)); } } if (fn_size+nargcfiles < 1) { print_error("merge", "You must specify at least one (and usually two or more) input files"); merge_usage(samtools_stderr); - free(fn_idx); - return 1; + ret = 1; + goto end; + } + + if (reg && fn_bed) { + print_error("merge", "You must specify either a BED file or a region"); + ret = 1; + goto end; } strcpy(mode, "wb"); - sam_open_mode(mode+1, argv[optind], NULL); + sam_open_mode(mode+1, fnout, NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? 
level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, - fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads, + if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers, + fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads, "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ret = 1; @@ -1633,6 +1719,12 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, ks_heapmake(heap, heap_size, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->entry.bam_record; + if (g_is_by_minhash && b->core.tid == -1) { + // Remove the cached minhash value + b->core.pos = -1; + b->core.mpos = -1; + b->core.isize = 0; + } if (sam_write1(fpout, hout, b) < 0) { print_error_errno(cmd, "failed writing to \"%s\"", out); goto fail; @@ -1791,12 +1883,45 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b) } } +// Sort by minimiser (stored in bam1_tag.u.pos). +// If equal, sort by position. +// +// The 64-bit sort key is split over the bam pos and isize fields. +// This permits it to survive writing to temporary file and coming back. 
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) +{ + const bam1_t *A = a.bam_record; + const bam1_t *B = b.bam_record; + + if (!A) return 1; + if (!B) return 0; + + if (A->core.tid != -1 || B->core.tid != -1) + return bam1_cmp_core(a,b); + + const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos; + const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos; + + if (m_a < m_b) // by hash + return -1; + else if (m_a > m_b) + return 1; + else if (A->core.isize < B->core.isize) // by hash location in seq + return -1; + else if (A->core.isize > B->core.isize) + return 1; + else + return bam1_cmp_core(a,b); +} + // Function to compare reads and determine which one is < the other // Handle sort-by-pos, sort-by-name, or sort-by-tag static inline int bam1_lt(const bam1_tag a, const bam1_tag b) { if (g_is_by_tag) { return bam1_cmp_by_tag(a, b) < 0; + } else if (g_is_by_minhash) { + return bam1_cmp_by_minhash(a, b) < 0; } else { return bam1_cmp_core(a,b) < 0; } @@ -1820,7 +1945,7 @@ typedef struct { // -1 for failure static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const sam_hdr_t *h, int n_threads, const htsFormat *fmt, - char *arg_list, int no_pg, int write_index) + int clear_minhash, char *arg_list, int no_pg, int write_index) { size_t i; samFile* fp; @@ -1828,22 +1953,27 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu fp = sam_open_format(fn, mode, fmt); if (fp == NULL) return -1; - if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", - "VN", samtools_version(), + if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? 
arg_list : NULL, NULL)) { goto fail; } - if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail; + if (sam_hdr_write(fp, h) != 0) goto fail; - if (write_index) { + if (write_index) if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail; - } if (n_threads > 1) hts_set_threads(fp, n_threads); for (i = 0; i < l; ++i) { - if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail; + bam1_t *b = buf[i].bam_record; + if (clear_minhash && b->core.tid == -1) { + // Remove the cached minhash value + b->core.pos = -1; + b->core.mpos = -1; + b->core.isize = 0; + } + if (sam_write1(fp, h, b) < 0) goto fail; } if (write_index) { @@ -1946,18 +2076,206 @@ err: return ret; } +/* + * Computes the minhash of a sequence using both forward and reverse strands. + * + * This is used as a sort key for unmapped data, to collate like sequences + * together and to improve compression ratio. + * + * The minhash is returned and *pos filled out with location of this hash + * key in the sequence if pos != NULL. 
+ */ +static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) { + uint64_t hashf = 0, minhashf = UINT64_MAX; + uint64_t hashr = 0, minhashr = UINT64_MAX; + int minhashpf = 0, minhashpr = 0, i; + uint64_t mask = (1L<<(2*kmer))-1; + unsigned char *seq = bam_get_seq(b); + int len = b->core.l_qseq; + + // Lookup tables for bam_seqi to 0123 fwd/rev hashes + // =ACM GRSV TWYH KDBN +#define X 0 + unsigned char L[16] = { + X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X, + }; + uint64_t R[16] = { + X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X, + }; + for (i = 0; i < 16; i++) + R[i] <<= 2*(kmer-1); + + // Punt homopolymers somewhere central in the hash space +#define XOR (0xdead7878beef7878 & mask) + + // Initialise hash keys + for (i = 0; i < kmer-1 && i < len; i++) { + int base = bam_seqi(seq, i); + hashf = (hashf<<2) | L[base]; + hashr = (hashr>>2) | R[base]; + } + + // Loop to find minimum + for (; i < len; i++) { + int base = bam_seqi(seq, i); + + hashf = ((hashf<<2) | L[base]) & mask; + hashr = (hashr>>2) | R[base]; + + if (minhashf > (hashf^XOR)) + minhashf = (hashf^XOR), minhashpf = i; + if (minhashr > (hashr^XOR)) + minhashr = (hashr^XOR), minhashpr = len-i+kmer-2; + + } + + if (minhashf <= minhashr) { + if (rev) *rev = 0; + if (pos) *pos = minhashpf; + return minhashf; + } else { + if (rev) *rev = 1; + if (pos) *pos = minhashpr; + return minhashr; + } +} + +//--- Start of candidates to punt to htslib +/*! + * @abstract + * Extracts the sequence (in current alignment orientation) from + * a bam record and places it in buf, which is nul terminated. + * + * @param b The bam structure + * @param buf A buffer at least b->core.l_qseq+1 bytes long + */ +static void bam_to_seq(bam1_t *b, char *buf) { + int i; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + buf[i] = seq_nt16_str[bam_seqi(seq, i)]; + buf[i] = 0; +} + +/*! + * @abstract + * Writes a new sequence, of length b->core.l_qseq, to a BAM record. 
+ * + * If a sequence of a new length is required the caller must first make + * room for it by updating the bam1_t struct. + * + * @param b The bam structure + * @param buf A buffer at least b->core.l_qseq bytes long + */ +static void seq_to_bam(bam1_t *b, char *buf) { + int i; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + bam_set_seqi(seq, i, seq_nt16_table[(unsigned char)buf[i]]); +} + +/*! + * @abstract Reverse complements a BAM record. + * + * It's possible to do this inline, but complex due to the 4-bit sequence + * encoding. For now I take the dumb approach. + * + * @param b Pointer to a BAM alignment + * + * @return 0 on success, -1 on failure (ENOMEM) + */ +static int reverse_complement(bam1_t *b) { + static char comp[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//00 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//10 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//20 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//30 + + // * * * * E F * * I J * L * * O + '@','T','V','G', 'H','E','F','C', 'D','I','H','M', 'L','K','N','O',//40 + //P Q * * * * * * X Y Z [ \ ] ^ _ + 'P','Q','Y','S', 'A','A','B','W', 'X','Y','Z','[','\\','[','^','_',//50 + //` * * * * E F * * I J * L * * O + '`','t','v','g', 'h','e','f','c', 'd','i','j','m', 'l','k','n','o',//60 + //P Q * * * * * * X Y Z { | } ~ DEL + 'p','q','y','s', 'a','a','b','w', 'x','y','z','{', '|','}','~',127,//70 + + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//80 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//90 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//A0 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//B0 + + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//C0 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//D0 + 'N','N','N','N', 'N','N','N','N', 
'N','N','N','N', 'N','N','N','N',//E0 + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//F0 + }; + char seq_[10000], *seq = seq_; + uint8_t *qual = bam_get_qual(b); + int i, j; + + if (b->core.l_qseq >= 10000) + if (!(seq = malloc(b->core.l_qseq+1))) + return -1; + + bam_to_seq(b, seq); + + for (i = 0, j = b->core.l_qseq-1; i < j; i++, j--) { + unsigned char tmp = seq[i]; + seq[i] = comp[(unsigned char)seq[j]]; + seq[j] = comp[tmp]; + tmp = qual[i]; + qual[i] = qual[j]; + qual[j] = tmp; + } + if (i ==j) + seq[i] = comp[(unsigned char)seq[i]]; + + seq_to_bam(b, seq); + + if (seq != seq_) + free(seq); + + b->core.flag ^= 0x10; + + return 0; +} +//--- End of candidates to punt to htslib + static void *worker(void *data) { worker_t *w = (worker_t*)data; char *name; w->error = 0; - if (!g_is_by_qname && !g_is_by_tag) { + if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) { if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { w->error = errno; return NULL; } } else { + if (g_is_by_minhash) { + int i; + for (i = 0; i < w->buf_len; i++) { + bam1_t *b = w->buf[i].bam_record; + if (b->core.tid != -1) + continue; + + int pos = 0, rev = 0; + uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev); + if (rev) + reverse_complement(b); + + // Store 64-bit hash in unmapped pos and mpos fields. + // The position of hash is in isize, which we use for + // resolving ties when sorting by hash key. + // These are unused for completely unmapped data and + // will be reset during final output. + b->core.pos = mh>>31; + b->core.mpos = mh&0x7fffffff; + b->core.isize = 65535-pos >=0 ? 
65535-pos : 0; + } + } ks_mergesort(sort, w->buf_len, w->buf, 0); } @@ -1985,10 +2303,10 @@ static void *worker(void *data) return 0; } - if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0) + if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0) w->error = errno; } else { - if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0) + if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0) w->error = errno; } @@ -2045,6 +2363,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, return n_files + n_threads; } + /*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @@ -2069,7 +2388,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, */ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, const char *fnout, const char *modeout, - size_t _max_mem, int n_threads, + size_t _max_mem, int by_minimiser, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -2092,6 +2411,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; + g_is_by_minhash = by_minimiser; if (sort_by_tag) { g_is_by_tag = 1; g_sort_tag[0] = sort_by_tag[0]; @@ -2118,11 +2438,23 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const else new_so = "coordinate"; - if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) - && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) - ) { - print_error("sort", "failed to change sort order header to '%s'\n", new_so); - goto err; + if (by_minimiser) { + const char *new_ss = "coordinate:minhash"; + if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss)) + && (-1 == 
sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, + "SO", new_so, "SS", new_ss, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n", + new_so, new_ss); + goto err; + } + } else { + if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); + goto err; + } } if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { @@ -2209,7 +2541,8 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const // write the final output if (n_files == 0 && num_in_mem < 2) { // a single block - if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) { + if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, + g_is_by_minhash, arg_list, no_pg, write_index) != 0) { print_error_errno("sort", "failed to create \"%s\"", fnout); goto err; } @@ -2263,7 +2596,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma char *fnout = calloc(strlen(prefix) + 4 + 1, 1); if (!fnout) return -1; sprintf(fnout, "%s.bam", prefix); - ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); + ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0); free(fnout); return ret; } @@ -2274,13 +2607,16 @@ static void sort_usage(FILE *fp) "Usage: samtools sort [options...] 
[in.bam]\n" "Options:\n" " -l INT Set compression level, from 0 (uncompressed) to 9 (best)\n" +" -u Output uncompressed data (equivalent to -l 0)\n" " -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n" -" -n Sort by read name\n" +" -M Use minimiser for clustering unaligned/unplaced reads\n" +" -K INT Kmer size to use for minimiser [20]\n" +" -n Sort by read name (not compatible with samtools index command)\n" " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" " -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n"); - sam_global_opt_help(fp, "-.O..@-."); + sam_global_opt_help(fp, "-.O..@.."); } static void complain_about_memory_setting(size_t max_mem) { @@ -2304,6 +2640,7 @@ int bam_sort(int argc, char *argv[]) { size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; + int by_minimiser = 0, minimiser_kmer = 20; char* sort_tag = NULL, *arg_list = NULL; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; @@ -2317,7 +2654,7 @@ int bam_sort(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; case 'n': is_by_qname = 1; break; @@ -2332,7 +2669,16 @@ int bam_sort(int argc, char *argv[]) } case 'T': kputs(optarg, &tmpprefix); break; case 'l': level = atoi(optarg); break; - case 1: no_pg = 1; break; + case 'u': level = 0; break; + case 1: no_pg = 1; break; + case 'M': by_minimiser = 1; break; + case 'K': + minimiser_kmer = atoi(optarg); + if (minimiser_kmer < 1) + minimiser_kmer = 1; + else if (minimiser_kmer > 31) + minimiser_kmer = 31; + break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) 
break; /* else fall-through */ @@ -2387,7 +2733,8 @@ int bam_sort(int argc, char *argv[]) } ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, + tmpprefix.s, fnout, modeout, max_mem, + by_minimiser * minimiser_kmer, ga.nthreads, &ga.in, &ga.out, arg_list, no_pg, ga.write_index); if (ret >= 0) ret = EXIT_SUCCESS; diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c index 5fb9ba0..31dc8fe 100644 --- a/samtools/bam_stat.c +++ b/samtools/bam_stat.c @@ -1,6 +1,6 @@ /* bam_stat.c -- flagstat subcommand. - Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. + Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd. Author: Heng Li @@ -42,32 +42,41 @@ typedef struct { long long n_dup[2]; long long n_diffchr[2], n_diffhigh[2]; long long n_secondary[2], n_supp[2]; + long long n_primary[2], n_pmapped[2], n_pdup[2]; } bam_flagstat_t; -#define flagstat_loop(s, c) do { \ - int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ - ++(s)->n_reads[w]; \ - if ((c)->flag & BAM_FSECONDARY ) { \ - ++(s)->n_secondary[w]; \ - } else if ((c)->flag & BAM_FSUPPLEMENTARY ) { \ - ++(s)->n_supp[w]; \ - } else if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all[w]; \ - if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w]; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ - if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map[w]; \ - if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr[w]; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ - } \ - } \ - } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ - } while (0) +inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c) +{ + int w = (c->flag & BAM_FQCFAIL)? 
1 : 0; + ++s->n_reads[w]; + if (c->flag & BAM_FSECONDARY ) { + ++s->n_secondary[w]; + } else if (c->flag & BAM_FSUPPLEMENTARY ) { + ++s->n_supp[w]; + } else { + ++s->n_primary[w]; + + if (c->flag & BAM_FPAIRED) { + ++s->n_pair_all[w]; + if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w]; + if (c->flag & BAM_FREAD1) ++s->n_read1[w]; + if (c->flag & BAM_FREAD2) ++s->n_read2[w]; + if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w]; + if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) { + ++s->n_pair_map[w]; + if (c->mtid != c->tid) { + ++s->n_diffchr[w]; + if (c->qual >= 5) ++s->n_diffhigh[w]; + } + } + } + + if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w]; + if (c->flag & BAM_FDUP) ++s->n_pdup[w]; + } + if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w]; + if (c->flag & BAM_FDUP) ++s->n_dup[w]; +} bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) { @@ -81,8 +90,10 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) while ((ret = sam_read1(fp, h, b)) >= 0) flagstat_loop(s, c); bam_destroy1(b); - if (ret != -1) - fprintf(stderr, "[bam_flagstat_core] Truncated file? 
Continue anyway.\n"); + if (ret != -1) { + free(s); + return NULL; + } return s; } @@ -114,10 +125,13 @@ static void out_fmt_default(bam_flagstat_t *s) { char b0[16], b1[16]; printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]); printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]); printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + printf("%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1])); printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); @@ -129,14 +143,18 @@ static void out_fmt_default(bam_flagstat_t *s) } static void out_fmt_json(bam_flagstat_t *s) { - char b0[16], b1[16]; + char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16]; printf("{\n \"QC-passed reads\": { \n" " \"total\": %lld, \n" + " \"primary\": %lld, \n" " \"secondary\": %lld, \n" " \"supplementary\": %lld, \n" " \"duplicates\": %lld, \n" + " \"primary duplicates\": %lld, \n" " \"mapped\": %lld, \n" " \"mapped %%\": %s, \n" + " \"primary mapped\": %lld, \n" + " \"primary mapped %%\": %s, \n" " \"paired in sequencing\": %lld, \n" " \"read1\": %lld, \n" " \"read2\": %lld, \n" @@ -150,11 +168,15 @@ static void out_fmt_json(bam_flagstat_t *s) { " }," "\n \"QC-failed reads\": { \n" " \"total\": %lld, \n" + " \"primary\": %lld, \n" " \"secondary\": %lld, \n" " \"supplementary\": 
%lld, \n" " \"duplicates\": %lld, \n" + " \"primary duplicates\": %lld, \n" " \"mapped\": %lld, \n" " \"mapped %%\": %s, \n" + " \"primary mapped\": %lld, \n" + " \"primary mapped %%\": %s, \n" " \"paired in sequencing\": %lld, \n" " \"read1\": %lld, \n" " \"read2\": %lld, \n" @@ -168,35 +190,43 @@ static void out_fmt_json(bam_flagstat_t *s) { " }\n" "}\n", s->n_reads[0], + s->n_primary[0], s->n_secondary[0], s->n_supp[0], s->n_dup[0], + s->n_pdup[0], s->n_mapped[0], percent_json(b0, s->n_mapped[0], s->n_reads[0]), + s->n_pmapped[0], + percent_json(p0, s->n_pmapped[0], s->n_primary[0]), s->n_pair_all[0], s->n_read1[0], s->n_read2[0], s->n_pair_good[0], - percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), + percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]), s->n_pair_map[0], s->n_sgltn[0], - percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]), + percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]), s->n_diffchr[0], s->n_diffhigh[0], s->n_reads[1], + s->n_primary[1], s->n_secondary[1], s->n_supp[1], s->n_dup[1], + s->n_pdup[1], s->n_mapped[1], percent_json(b1, s->n_mapped[1], s->n_reads[1]), + s->n_pmapped[1], + percent_json(p1, s->n_pmapped[1], s->n_primary[1]), s->n_pair_all[1], s->n_read1[1], s->n_read2[1], s->n_pair_good[1], - percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), + percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]), s->n_pair_map[1], s->n_sgltn[1], - percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]), + percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]), s->n_diffchr[1], s->n_diffhigh[1] ); @@ -205,11 +235,15 @@ static void out_fmt_json(bam_flagstat_t *s) { static void out_fmt_tsv(bam_flagstat_t *s) { char b0[16], b1[16]; printf("%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]); printf("%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); printf("%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); 
printf("%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]); printf("%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); printf("%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + printf("%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]); + printf("%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1])); printf("%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); printf("%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); printf("%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); @@ -242,7 +276,7 @@ int bam_flagstat(int argc, char *argv[]) sam_hdr_t *header; bam_flagstat_t *s; const char *out_fmt = "default"; - int c; + int c, status = EXIT_SUCCESS; enum { INPUT_FMT_OPTION = CHAR_MAX+1, @@ -296,10 +330,17 @@ int bam_flagstat(int argc, char *argv[]) } s = bam_flagstat_core(fp, header); - output_fmt(s, out_fmt); - free(s); + if (s) { + output_fmt(s, out_fmt); + free(s); + } + else { + print_error("flagstat", "error reading from \"%s\"", argv[optind]); + status = EXIT_FAILURE; + } + sam_hdr_destroy(header); sam_close(fp); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c index 84a9ea4..bd6f4ca 100644 --- a/samtools/bam_stat.c.pysam.c +++ b/samtools/bam_stat.c.pysam.c @@ -2,7 +2,7 @@ /* bam_stat.c -- flagstat subcommand. - Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. + Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd. Author: Heng Li @@ -44,32 +44,41 @@ typedef struct { long long n_dup[2]; long long n_diffchr[2], n_diffhigh[2]; long long n_secondary[2], n_supp[2]; + long long n_primary[2], n_pmapped[2], n_pdup[2]; } bam_flagstat_t; -#define flagstat_loop(s, c) do { \ - int w = ((c)->flag & BAM_FQCFAIL)? 
1 : 0; \ - ++(s)->n_reads[w]; \ - if ((c)->flag & BAM_FSECONDARY ) { \ - ++(s)->n_secondary[w]; \ - } else if ((c)->flag & BAM_FSUPPLEMENTARY ) { \ - ++(s)->n_supp[w]; \ - } else if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all[w]; \ - if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w]; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ - if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map[w]; \ - if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr[w]; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ - } \ - } \ - } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ - } while (0) +inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c) +{ + int w = (c->flag & BAM_FQCFAIL)? 1 : 0; + ++s->n_reads[w]; + if (c->flag & BAM_FSECONDARY ) { + ++s->n_secondary[w]; + } else if (c->flag & BAM_FSUPPLEMENTARY ) { + ++s->n_supp[w]; + } else { + ++s->n_primary[w]; + + if (c->flag & BAM_FPAIRED) { + ++s->n_pair_all[w]; + if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w]; + if (c->flag & BAM_FREAD1) ++s->n_read1[w]; + if (c->flag & BAM_FREAD2) ++s->n_read2[w]; + if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w]; + if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) { + ++s->n_pair_map[w]; + if (c->mtid != c->tid) { + ++s->n_diffchr[w]; + if (c->qual >= 5) ++s->n_diffhigh[w]; + } + } + } + + if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w]; + if (c->flag & BAM_FDUP) ++s->n_pdup[w]; + } + if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w]; + if (c->flag & BAM_FDUP) ++s->n_dup[w]; +} bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) { @@ -83,8 +92,10 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) while ((ret = sam_read1(fp, h, b)) >= 0) 
flagstat_loop(s, c); bam_destroy1(b); - if (ret != -1) - fprintf(samtools_stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + if (ret != -1) { + free(s); + return NULL; + } return s; } @@ -109,17 +120,20 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -O, --"); fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n" " Specify output format (json, tsv)\n"); - exit(exit_status); + samtools_exit(exit_status); } static void out_fmt_default(bam_flagstat_t *s) { char b0[16], b1[16]; fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + fprintf(samtools_stdout, "%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]); fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + fprintf(samtools_stdout, "%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]); fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + fprintf(samtools_stdout, "%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1])); fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); @@ -131,14 +145,18 @@ static void out_fmt_default(bam_flagstat_t *s) } static void out_fmt_json(bam_flagstat_t *s) { - char b0[16], b1[16]; + char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16]; fprintf(samtools_stdout, "{\n \"QC-passed reads\": { \n" " \"total\": %lld, \n" + " 
\"primary\": %lld, \n" " \"secondary\": %lld, \n" " \"supplementary\": %lld, \n" " \"duplicates\": %lld, \n" + " \"primary duplicates\": %lld, \n" " \"mapped\": %lld, \n" " \"mapped %%\": %s, \n" + " \"primary mapped\": %lld, \n" + " \"primary mapped %%\": %s, \n" " \"paired in sequencing\": %lld, \n" " \"read1\": %lld, \n" " \"read2\": %lld, \n" @@ -152,11 +170,15 @@ static void out_fmt_json(bam_flagstat_t *s) { " }," "\n \"QC-failed reads\": { \n" " \"total\": %lld, \n" + " \"primary\": %lld, \n" " \"secondary\": %lld, \n" " \"supplementary\": %lld, \n" " \"duplicates\": %lld, \n" + " \"primary duplicates\": %lld, \n" " \"mapped\": %lld, \n" " \"mapped %%\": %s, \n" + " \"primary mapped\": %lld, \n" + " \"primary mapped %%\": %s, \n" " \"paired in sequencing\": %lld, \n" " \"read1\": %lld, \n" " \"read2\": %lld, \n" @@ -170,35 +192,43 @@ static void out_fmt_json(bam_flagstat_t *s) { " }\n" "}\n", s->n_reads[0], + s->n_primary[0], s->n_secondary[0], s->n_supp[0], s->n_dup[0], + s->n_pdup[0], s->n_mapped[0], percent_json(b0, s->n_mapped[0], s->n_reads[0]), + s->n_pmapped[0], + percent_json(p0, s->n_pmapped[0], s->n_primary[0]), s->n_pair_all[0], s->n_read1[0], s->n_read2[0], s->n_pair_good[0], - percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), + percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]), s->n_pair_map[0], s->n_sgltn[0], - percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]), + percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]), s->n_diffchr[0], s->n_diffhigh[0], s->n_reads[1], + s->n_primary[1], s->n_secondary[1], s->n_supp[1], s->n_dup[1], + s->n_pdup[1], s->n_mapped[1], percent_json(b1, s->n_mapped[1], s->n_reads[1]), + s->n_pmapped[1], + percent_json(p1, s->n_pmapped[1], s->n_primary[1]), s->n_pair_all[1], s->n_read1[1], s->n_read2[1], s->n_pair_good[1], - percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), + percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]), s->n_pair_map[1], s->n_sgltn[1], - percent_json(b1, s->n_sgltn[1], 
s->n_pair_all[1]), + percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]), s->n_diffchr[1], s->n_diffhigh[1] ); @@ -207,11 +237,15 @@ static void out_fmt_json(bam_flagstat_t *s) { static void out_fmt_tsv(bam_flagstat_t *s) { char b0[16], b1[16]; fprintf(samtools_stdout, "%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + fprintf(samtools_stdout, "%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]); fprintf(samtools_stdout, "%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); fprintf(samtools_stdout, "%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); fprintf(samtools_stdout, "%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); + fprintf(samtools_stdout, "%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]); fprintf(samtools_stdout, "%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); fprintf(samtools_stdout, "%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + fprintf(samtools_stdout, "%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]); + fprintf(samtools_stdout, "%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1])); fprintf(samtools_stdout, "%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); fprintf(samtools_stdout, "%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); fprintf(samtools_stdout, "%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); @@ -244,7 +278,7 @@ int bam_flagstat(int argc, char *argv[]) sam_hdr_t *header; bam_flagstat_t *s; const char *out_fmt = "default"; - int c; + int c, status = EXIT_SUCCESS; enum { INPUT_FMT_OPTION = CHAR_MAX+1, @@ -298,10 +332,17 @@ int bam_flagstat(int argc, char *argv[]) } s = bam_flagstat_core(fp, header); - output_fmt(s, out_fmt); - free(s); + if (s) { + output_fmt(s, out_fmt); + free(s); + } + else { + print_error("flagstat", "error reading from \"%s\"", argv[optind]); + 
status = EXIT_FAILURE; + } + sam_hdr_destroy(header); sam_close(fp); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/bamtk.c b/samtools/bamtk.c index a6959f9..93e6468 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2019 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Author: Heng Li @@ -30,6 +30,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "htslib/hts.h" +#include "htslib/hfile.h" #include "samtools.h" #include "version.h" @@ -46,7 +47,6 @@ int bam_fillmd(int argc, char *argv[]); int bam_idxstats(int argc, char *argv[]); int bam_markdup(int argc, char *argv[]); int main_samview(int argc, char *argv[]); -int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); @@ -65,12 +65,78 @@ int main_addreplacerg(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int dict_main(int argc, char *argv[]); int fqidx_main(int argc, char *argv[]); +int amplicon_clip_main(int argc, char *argv[]); +int main_ampliconstats(int argc, char *argv[]); +int main_import(int argc, char *argv[]); const char *samtools_version() { return SAMTOOLS_VERSION; } +// These come out of the config.h file built by autoconf or Makefile +const char *samtools_feature_string(void) { + const char *fmt = + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef HAVE_CURSES + "curses=yes " +#else + "curses=no " +#endif + ; + + return fmt; +} + +static void long_version(void) { + printf("samtools %s\n" + "Using htslib %s\n" + "Copyright (C) 2021 Genome Research Ltd.\n", + samtools_version(), hts_version()); + + printf("\nSamtools compilation details:\n"); + printf(" Features: %s\n", samtools_feature_string()); + printf(" CC: %s\n", SAMTOOLS_CC); + printf(" CPPFLAGS: %s\n", SAMTOOLS_CPPFLAGS); + printf(" CFLAGS: 
%s\n", SAMTOOLS_CFLAGS); + printf(" LDFLAGS: %s\n", SAMTOOLS_LDFLAGS); + printf(" HTSDIR: %s\n", SAMTOOLS_HTSDIR); + printf(" LIBS: %s\n", SAMTOOLS_LIBS); + printf(" CURSES_LIB: %s\n", SAMTOOLS_CURSES_LIB); + + printf("\nHTSlib compilation details:\n"); + printf(" Features: %s\n", hts_feature_string()); + printf(" CC: %s\n", hts_test_feature(HTS_FEATURE_CC)); + printf(" CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); + printf(" CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS)); + printf(" LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS)); + + // Plugins and schemes + printf("\nHTSlib URL scheme handlers present:\n"); + const char *plugins[100]; + int np = 100, i, j; + + if (hfile_list_plugins(plugins, &np) < 0) + return; + + for (i = 0; i < np; i++) { + const char *sc_list[100]; + int nschemes = 100; + if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0) + return; + + printf(" %s:\t", plugins[i]); + for (j = 0; j < nschemes; j++) + printf(" %s%c", sc_list[j], ",\n"[j+1==nschemes]); + } +} + static void usage(FILE *fp) { /* Please improve the grouping */ @@ -96,6 +162,7 @@ static void usage(FILE *fp) " targetcut cut fosmid regions (for fosmid pool only)\n" " addreplacerg adds or replaces RG tags\n" " markdup mark duplicates\n" +" ampliconclip clip oligos from the end of reads\n" "\n" " -- File operations\n" " collate shuffle and group alignments by name\n" @@ -107,6 +174,7 @@ static void usage(FILE *fp) " quickcheck quickly check if SAM/BAM/CRAM file appears intact\n" " fastq converts a BAM to a FASTQ\n" " fasta converts a BAM to a FASTA\n" +" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" "\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -116,19 +184,18 @@ static void usage(FILE *fp) " idxstats BAM index stats\n" " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" +" ampliconstats generate amplicon specific stats\n" "\n" " -- Viewing\n" " flags explain BAM flags\n" " tview text alignment 
viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" +"\n" +" -- Misc\n" +" help [cmd] display this help message or help for [cmd]\n" +" version detailed version information\n" "\n"); -#ifdef _WIN32 - fprintf(fp, -"Note: The Windows version of SAMtools is mainly designed for read-only\n" -" operations, such as viewing the alignments and generating the pileup.\n" -" Binary files generated by the Windows version may be buggy.\n\n"); -#endif } // This is a tricky one, but on Windows the filename wildcard expansion is done by @@ -176,6 +243,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); + else if (strcmp(argv[1], "ampliconclip") == 0) ret = amplicon_clip_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0 || strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); @@ -206,12 +274,10 @@ int main(int argc, char *argv[]) return 1; } else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); - else if (strcmp(argv[1], "--version") == 0) { - printf( -"samtools %s\n" -"Using htslib %s\n" -"Copyright (C) 2019 Genome Research Ltd.\n", - samtools_version(), hts_version()); + else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); + else if (strcmp(argv[1], "version") == 0 || \ + strcmp(argv[1], "--version") == 0) { + long_version(); } else if (strcmp(argv[1], "--version-only") == 0) { printf("%s+htslib-%s\n", samtools_version(), hts_version()); diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index 91c29b8..dfb2cdd 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -2,7 +2,7 @@ /* bamtk.c -- main samtools command front-end. 
- Copyright (C) 2008-2019 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Author: Heng Li @@ -32,8 +32,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include "htslib/hts.h" +#include "htslib/hfile.h" #include "samtools.h" #include "version.h" +#include "samtools_config_vars.h" int bam_taf2baf(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); @@ -48,8 +50,7 @@ int bam_fillmd(int argc, char *argv[]); int bam_idxstats(int argc, char *argv[]); int bam_markdup(int argc, char *argv[]); int main_samview(int argc, char *argv[]); -int main_import(int argc, char *argv[]); -int main_reheader(int argc, char *argv[]); +int samtools_main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); int main_cat(int argc, char *argv[]); @@ -67,12 +68,78 @@ int main_addreplacerg(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int dict_main(int argc, char *argv[]); int fqidx_main(int argc, char *argv[]); +int amplicon_clip_main(int argc, char *argv[]); +int main_ampliconstats(int argc, char *argv[]); +int main_import(int argc, char *argv[]); const char *samtools_version() { return SAMTOOLS_VERSION; } +// These come out of the config.h file built by autoconf or Makefile +const char *samtools_feature_string(void) { + const char *fmt = + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef HAVE_CURSES + "curses=yes " +#else + "curses=no " +#endif + ; + + return fmt; +} + +static void long_version(void) { + fprintf(samtools_stdout, "samtools %s\n" + "Using htslib %s\n" + "Copyright (C) 2021 Genome Research Ltd.\n", + samtools_version(), hts_version()); + + fprintf(samtools_stdout, "\nSamtools compilation details:\n"); + fprintf(samtools_stdout, " Features: %s\n", samtools_feature_string()); + fprintf(samtools_stdout, " CC: %s\n", SAMTOOLS_CC); + fprintf(samtools_stdout, " CPPFLAGS: %s\n", SAMTOOLS_CPPFLAGS); + fprintf(samtools_stdout, " CFLAGS: 
%s\n", SAMTOOLS_CFLAGS); + fprintf(samtools_stdout, " LDFLAGS: %s\n", SAMTOOLS_LDFLAGS); + fprintf(samtools_stdout, " HTSDIR: %s\n", SAMTOOLS_HTSDIR); + fprintf(samtools_stdout, " LIBS: %s\n", SAMTOOLS_LIBS); + fprintf(samtools_stdout, " CURSES_LIB: %s\n", SAMTOOLS_CURSES_LIB); + + fprintf(samtools_stdout, "\nHTSlib compilation details:\n"); + fprintf(samtools_stdout, " Features: %s\n", hts_feature_string()); + fprintf(samtools_stdout, " CC: %s\n", hts_test_feature(HTS_FEATURE_CC)); + fprintf(samtools_stdout, " CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); + fprintf(samtools_stdout, " CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS)); + fprintf(samtools_stdout, " LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS)); + + // Plugins and schemes + fprintf(samtools_stdout, "\nHTSlib URL scheme handlers present:\n"); + const char *plugins[100]; + int np = 100, i, j; + + if (hfile_list_plugins(plugins, &np) < 0) + return; + + for (i = 0; i < np; i++) { + const char *sc_list[100]; + int nschemes = 100; + if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0) + return; + + fprintf(samtools_stdout, " %s:\t", plugins[i]); + for (j = 0; j < nschemes; j++) + fprintf(samtools_stdout, " %s%c", sc_list[j], ",\n"[j+1==nschemes]); + } +} + static void usage(FILE *fp) { /* Please improve the grouping */ @@ -98,6 +165,7 @@ static void usage(FILE *fp) " targetcut cut fosmid regions (for fosmid pool only)\n" " addreplacerg adds or replaces RG tags\n" " markdup mark duplicates\n" +" ampliconclip clip oligos from the end of reads\n" "\n" " -- File operations\n" " collate shuffle and group alignments by name\n" @@ -109,6 +177,7 @@ static void usage(FILE *fp) " quickcheck quickly check if SAM/BAM/CRAM file appears intact\n" " fastq converts a BAM to a FASTQ\n" " fasta converts a BAM to a FASTA\n" +" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" "\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -118,19 +187,18 @@ static void usage(FILE *fp) " 
idxstats BAM index stats\n" " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" +" ampliconstats generate amplicon specific stats\n" "\n" " -- Viewing\n" " flags explain BAM flags\n" " tview text alignment viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" +"\n" +" -- Misc\n" +" help [cmd] display this help message or help for [cmd]\n" +" version detailed version information\n" "\n"); -#ifdef _WIN32 - fprintf(fp, -"Note: The Windows version of SAMtools is mainly designed for read-only\n" -" operations, such as viewing the alignments and generating the pileup.\n" -" Binary files generated by the Windows version may be buggy.\n\n"); -#endif } // This is a tricky one, but on Windows the filename wildcard expansion is done by @@ -178,11 +246,12 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); + else if (strcmp(argv[1], "ampliconclip") == 0) ret = amplicon_clip_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0 || strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "reheader") == 0) ret = samtools_main_reheader(argc-1, argv+1); else if (strcmp(argv[1], "cat") == 0) ret = main_cat(argc-1, argv+1); else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1); else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1); @@ -208,12 +277,10 @@ int samtools_main(int argc, char *argv[]) return 1; } //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); 
- else if (strcmp(argv[1], "--version") == 0) { - fprintf(samtools_stdout, -"samtools %s\n" -"Using htslib %s\n" -"Copyright (C) 2019 Genome Research Ltd.\n", - samtools_version(), hts_version()); + else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); + else if (strcmp(argv[1], "version") == 0 || \ + strcmp(argv[1], "--version") == 0) { + long_version(); } else if (strcmp(argv[1], "--version-only") == 0) { fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version()); diff --git a/samtools/bedcov.c b/samtools/bedcov.c index a36d672..bccc09b 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -1,7 +1,7 @@ /* bedcov.c -- bedcov subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd. + Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd. Author: Heng Li @@ -40,11 +40,14 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) +#define DEFAULT_DEPTH 64000 + typedef struct { htsFile *fp; sam_hdr_t *header; hts_itr_t *iter; int min_mapQ; + uint32_t flags; // read filtering flags } aux_t; static int read_bam(void *data, bam1_t *b) @@ -55,7 +58,7 @@ static int read_bam(void *data, bam1_t *b) { ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b); if ( ret<0 ) break; - if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + if ( b->core.flag & aux->flags ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; break; } @@ -69,10 +72,12 @@ int main_bedcov(int argc, char *argv[]) kstream_t *ks; hts_idx_t **idx; aux_t **aux; - int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; - int64_t *cnt; + int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0; + int64_t *cnt, *pcov = NULL;; const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; + uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); + int tflags = 0, min_depth = -1; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -80,11 +85,28 @@ int main_bedcov(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; case 'X': has_index_file = 1; break; + case 'g': + tflags = bam_str2flag(optarg); + if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { + print_error("bedcov", "Flag value \"%s\" is not supported", optarg); + return 1; + } + flags &= ~tflags; + break; + case 'G': + tflags = bam_str2flag(optarg); + if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { + print_error("bedcov", "Flag value \"%s\" is not supported", optarg); + return 1; + } + flags |= tflags; + break; case 'j': skip_DN = 1; break; + case 'd': min_depth = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage = 1; break; @@ -96,7 +118,12 @@ int main_bedcov(int argc, char *argv[]) fprintf(stderr, "Options:\n"); fprintf(stderr, " -Q mapping quality threshold [0]\n"); fprintf(stderr, " -X use customized index files\n"); + 
fprintf(stderr, " -g remove the specified flags from the set used to filter out reads\n"); + fprintf(stderr, " -G add the specified flags to the set used to filter out reads\n" + " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704"); fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); + fprintf(stderr, " -d depth threshold. Number of reference bases with coverage above and" + " including this value will be displayed in a separate column\n"); sam_global_opt_help(stderr, "-.--.--."); return 1; } @@ -136,8 +163,11 @@ int main_bedcov(int argc, char *argv[]) argv[i+optind+1]); return 2; } + aux[i]->flags = flags; } - cnt = calloc(n, 8); + cnt = calloc(n, sizeof(*cnt)); + if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov)); + if (!cnt || (min_depth >= 0 && !pcov)) return 2; fp = gzopen(argv[optind], "rb"); if (fp == NULL) { @@ -149,7 +179,8 @@ int main_bedcov(int argc, char *argv[]) plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; - int tid, beg, end, pos; + int tid, pos, num = 0; + int64_t beg = 0, end = 0; bam_mplp_t mplp; if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */ @@ -158,53 +189,75 @@ int main_bedcov(int argc, char *argv[]) be followed by a tab in that case). 
*/ if (strncmp(str.s, "track ", 6) == 0) continue; if (strncmp(str.s, "browser ", 8) == 0) continue; - for (p = q = str.s; *p && *p != '\t'; ++p); - if (*p != '\t') goto bed_error; - *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; + for (p = q = str.s; *p && !isspace(*p); ++p); + if (*p == 0) goto bed_error; + char c = *p; + *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c; if (tid < 0) goto bed_error; - for (q = p = p + 1; isdigit(*p); ++p); - if (*p != '\t') goto bed_error; - *p = 0; beg = atoi(q); *p = '\t'; - for (q = p = p + 1; isdigit(*p); ++p); - if (*p == '\t' || *p == 0) { - int c = *p; - *p = 0; end = atoi(q); *p = c; - } else goto bed_error; + num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end); + if (num < 2 || end < beg) goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } + mplp = bam_mplp_init(n, read_bam, (void**)aux); - bam_mplp_set_maxcnt(mplp, 64000); - memset(cnt, 0, 8 * n); - while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) + if (min_depth > DEFAULT_DEPTH) + bam_mplp_set_maxcnt(mplp, min_depth); + else + bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH); + + memset(cnt, 0, sizeof(*cnt) * n); + if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n); + + while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) if (pos >= beg && pos < end) { - for (i = 0, m = 0; i < n; ++i) { - if (skip_DN) + for (i = 0; i < n; ++i) { + m = 0; + if (skip_DN || min_depth >= 0) { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *pi = plp[i] + j; if (pi->is_del || pi->is_refskip) ++m; } - cnt[i] += n_plp[i] - m; + } + int pd = n_plp[i] - m; + cnt[i] += pd; + if (min_depth >= 0 && pd >= min_depth) pcov[i]++; } } + + if (ret < 0) { + print_error("bedcov", "error reading from input file"); + status = 2; + bam_mplp_destroy(mplp); + break; + } + for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } + if (min_depth >= 0) { + for (i = 0; i < n; 
++i) { + kputc('\t', &str); + kputl(pcov[i], &str); + } + } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); + status = 2; } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); + free(pcov); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); @@ -215,5 +268,5 @@ bed_error: free(aux); free(idx); free(str.s); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index 82b63aa..b72cbf1 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -3,7 +3,7 @@ /* bedcov.c -- bedcov subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd. + Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd. Author: Heng Li @@ -42,11 +42,14 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) +#define DEFAULT_DEPTH 64000 + typedef struct { htsFile *fp; sam_hdr_t *header; hts_itr_t *iter; int min_mapQ; + uint32_t flags; // read filtering flags } aux_t; static int read_bam(void *data, bam1_t *b) @@ -57,7 +60,7 @@ static int read_bam(void *data, bam1_t *b) { ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b); if ( ret<0 ) break; - if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + if ( b->core.flag & aux->flags ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; break; } @@ -71,10 +74,12 @@ int main_bedcov(int argc, char *argv[]) kstream_t *ks; hts_idx_t **idx; aux_t **aux; - int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; - int64_t *cnt; + int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0; + int64_t *cnt, *pcov = NULL;; const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; + uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); + int tflags = 0, min_depth = -1; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { @@ -82,11 +87,28 @@ int main_bedcov(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; case 'X': has_index_file = 1; break; + case 'g': + tflags = bam_str2flag(optarg); + if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { + print_error("bedcov", "Flag value \"%s\" is not supported", optarg); + return 1; + } + flags &= ~tflags; + break; + case 'G': + tflags = bam_str2flag(optarg); + if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { + print_error("bedcov", "Flag value \"%s\" is not supported", optarg); + return 1; + } + flags |= tflags; + break; case 'j': skip_DN = 1; break; + case 'd': min_depth = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage = 1; break; @@ -98,7 +120,12 @@ int main_bedcov(int argc, char *argv[]) fprintf(samtools_stderr, "Options:\n"); fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); fprintf(samtools_stderr, " -X use 
customized index files\n"); + fprintf(samtools_stderr, " -g remove the specified flags from the set used to filter out reads\n"); + fprintf(samtools_stderr, " -G add the specified flags to the set used to filter out reads\n" + " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704"); fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); + fprintf(samtools_stderr, " -d depth threshold. Number of reference bases with coverage above and" + " including this value will be displayed in a separate column\n"); sam_global_opt_help(samtools_stderr, "-.--.--."); return 1; } @@ -138,8 +165,11 @@ int main_bedcov(int argc, char *argv[]) argv[i+optind+1]); return 2; } + aux[i]->flags = flags; } - cnt = calloc(n, 8); + cnt = calloc(n, sizeof(*cnt)); + if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov)); + if (!cnt || (min_depth >= 0 && !pcov)) return 2; fp = gzopen(argv[optind], "rb"); if (fp == NULL) { @@ -151,7 +181,8 @@ int main_bedcov(int argc, char *argv[]) plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; - int tid, beg, end, pos; + int tid, pos, num = 0; + int64_t beg = 0, end = 0; bam_mplp_t mplp; if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */ @@ -160,53 +191,75 @@ int main_bedcov(int argc, char *argv[]) be followed by a tab in that case). 
*/ if (strncmp(str.s, "track ", 6) == 0) continue; if (strncmp(str.s, "browser ", 8) == 0) continue; - for (p = q = str.s; *p && *p != '\t'; ++p); - if (*p != '\t') goto bed_error; - *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; + for (p = q = str.s; *p && !isspace(*p); ++p); + if (*p == 0) goto bed_error; + char c = *p; + *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c; if (tid < 0) goto bed_error; - for (q = p = p + 1; isdigit(*p); ++p); - if (*p != '\t') goto bed_error; - *p = 0; beg = atoi(q); *p = '\t'; - for (q = p = p + 1; isdigit(*p); ++p); - if (*p == '\t' || *p == 0) { - int c = *p; - *p = 0; end = atoi(q); *p = c; - } else goto bed_error; + num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end); + if (num < 2 || end < beg) goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } + mplp = bam_mplp_init(n, read_bam, (void**)aux); - bam_mplp_set_maxcnt(mplp, 64000); - memset(cnt, 0, 8 * n); - while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) + if (min_depth > DEFAULT_DEPTH) + bam_mplp_set_maxcnt(mplp, min_depth); + else + bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH); + + memset(cnt, 0, sizeof(*cnt) * n); + if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n); + + while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) if (pos >= beg && pos < end) { - for (i = 0, m = 0; i < n; ++i) { - if (skip_DN) + for (i = 0; i < n; ++i) { + m = 0; + if (skip_DN || min_depth >= 0) { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *pi = plp[i] + j; if (pi->is_del || pi->is_refskip) ++m; } - cnt[i] += n_plp[i] - m; + } + int pd = n_plp[i] - m; + cnt[i] += pd; + if (min_depth >= 0 && pd >= min_depth) pcov[i]++; } } + + if (ret < 0) { + print_error("bedcov", "error reading from input file"); + status = 2; + bam_mplp_destroy(mplp); + break; + } + for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } + if (min_depth >= 0) { + for (i = 0; i < n; 
++i) { + kputc('\t', &str); + kputl(pcov[i], &str); + } + } samtools_puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(samtools_stderr, "Errors in BED line '%s'\n", str.s); + status = 2; } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); + free(pcov); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); @@ -217,5 +270,5 @@ bed_error: free(aux); free(idx); free(str.s); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/bedidx.c b/samtools/bedidx.c index ded2314..6b22d4e 100644 --- a/samtools/bedidx.c +++ b/samtools/bedidx.c @@ -573,6 +573,14 @@ const char* bed_get(void *reg_hash, int i, int filter) { return kh_key(h, i); } +/** + * Create a region list from a the region hash table + * @param reg_hash The region hash table + * @param filter 0 - allow all regions, 1 - allow only selected regions + * @param n_reg Pointer to the returned region number + * @return The regions list as a hts_reglist_t + */ + hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) { reghash_t *h; diff --git a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c index 027e08e..533b42a 100644 --- a/samtools/bedidx.c.pysam.c +++ b/samtools/bedidx.c.pysam.c @@ -575,6 +575,14 @@ const char* bed_get(void *reg_hash, int i, int filter) { return kh_key(h, i); } +/** + * Create a region list from a the region hash table + * @param reg_hash The region hash table + * @param filter 0 - allow all regions, 1 - allow only selected regions + * @param n_reg Pointer to the returned region number + * @return The regions list as a hts_reglist_t + */ + hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) { reghash_t *h; diff --git a/samtools/coverage.c b/samtools/coverage.c index c4f38de..cab1f8b 100644 --- a/samtools/coverage.c +++ b/samtools/coverage.c @@ -1,7 +1,7 @@ /* coverage.c -- samtools coverage subcommand Copyright (C) 2018,2019 Florian Breitwieser - Portions copyright 
(C) 2019 Genome Research Ltd. + Portions copyright (C) 2019-2021 Genome Research Ltd. Author: Florian P Breitwieser @@ -24,7 +24,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* This program calculates coverage from multiple BAMs - * simutaneously, to achieve random access and to use the BED interface. + * simultaneously, to achieve random access and to use the BED interface. * To compile this program separately, you may: * * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz @@ -57,19 +57,6 @@ DEALINGS IN THE SOFTWARE. */ const char *VERSION = "0.1"; -typedef struct { // auxiliary data structure to hold a BAM file - samFile *fp; // file handle - sam_hdr_t *hdr; // file header - hts_itr_t *iter; // iterator to a region - NULL for us by default - int min_mapQ; // mapQ filter - int min_len; // length filter - unsigned int n_reads; // records the number of reads seen in file - unsigned int n_selected_reads; // records the number of reads passing filter - unsigned long summed_mapQ; // summed mapQ of all reads passing filter - int fail_flags; - int required_flags; -} bam_aux_t; - typedef struct { // auxiliary data structure to hold stats on coverage unsigned long long n_covered_bases; unsigned long long summed_coverage; @@ -77,12 +64,23 @@ typedef struct { // auxiliary data structure to hold stats on coverage unsigned long long summed_mapQ; unsigned int n_reads; unsigned int n_selected_reads; - int32_t tid; // chromosome ID, defined by header + bool covered; hts_pos_t beg; hts_pos_t end; int64_t bin_width; } stats_aux_t; +typedef struct { // auxiliary data structure to hold a BAM file + samFile *fp; // file handle + sam_hdr_t *hdr; // file header + hts_itr_t *iter; // iterator to a region - NULL for us by default + int min_mapQ; // mapQ filter + int min_len; // length filter + int fail_flags; + int required_flags; + stats_aux_t *stats; +} bam_aux_t; + #if __STDC_VERSION__ >= 199901L #define 
VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL @@ -91,7 +89,7 @@ typedef struct { // auxiliary data structure to hold stats on coverage // LOWER ONE EIGHTH BLOCK … FULL BLOCK static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those -static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; +static const char *const BLOCK_CHARS2[2] = {".", ":"}; #else @@ -102,7 +100,7 @@ static const char *const BLOCK_CHARS8[8] = { "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; -static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; +static const char *const BLOCK_CHARS2[2] = {".", ":"}; #endif @@ -114,11 +112,14 @@ static int usage() { "Input options:\n" " -b, --bam-list FILE list of input BAM filenames, one per line\n" " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n" - " -q, --min-MQ INT base quality threshold [0]\n" - " -Q, --min-BQ INT mapping quality threshold [0]\n" + " -q, --min-MQ INT mapping quality threshold [0]\n" + " -Q, --min-BQ INT base quality threshold [0]\n" " --rf required flags: skip reads with mask bits unset []\n" " --ff filter flags: skip reads with mask bits set \n" " [UNMAP,SECONDARY,QCFAIL,DUP]\n" + " -d, --depth INT maximum allowed coverage depth [1000000].\n" + " If 0, depth is set to the maximum integer value,\n" + " effectively removing any depth limit.\n" "Output options:\n" " -m, --histogram show histogram instead of tabular output\n" " -A, --ascii show only ASCII characters in histogram\n" @@ -171,79 +172,63 @@ static char* readable_bps(double base_pairs, char *buf) { return buf; } -static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) { - int i; - stats->n_reads = 0; - 
stats->n_selected_reads = 0; - stats->summed_mapQ = 0; - for (i = 0; i < n_bam_files && data[i]; ++i) { - stats->n_reads += data[i]->n_reads; - stats->n_selected_reads += data[i]->n_selected_reads; - stats->summed_mapQ += data[i]->summed_mapQ; - data[i]->n_reads = 0; - data[i]->n_selected_reads = 0; - data[i]->summed_mapQ = 0; - } -} - // read one alignment from one BAM file static int read_bam(void *data, bam1_t *b) { bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure + int nref = sam_hdr_nref(aux->hdr); int ret; while (1) { if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break; - ++aux->n_reads; + if (b->core.tid >= 0 && b->core.tid < nref) + aux->stats[b->core.tid].n_reads++; if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue; if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue; if ( b->core.qual < aux->min_mapQ ) continue; if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - ++aux->n_selected_reads; - aux->summed_mapQ += b->core.qual; + if (b->core.tid >= 0 && b->core.tid < nref) { + aux->stats[b->core.tid].n_selected_reads++; + aux->stats[b->core.tid].summed_mapQ += b->core.qual; + } break; } return ret; } -void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) { - fputs(sam_hdr_tid2name(h, stats->tid), file_out); - double region_len = (double) stats->end - stats->beg; +void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) { + fputs(sam_hdr_tid2name(h, tid), file_out); + double region_len = (double) stats[tid].end - stats[tid].beg; fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n", - stats->beg+1, - stats->end, - stats->n_selected_reads, - stats->n_covered_bases, - 100.0 * stats->n_covered_bases / region_len, - stats->summed_coverage / region_len, - stats->summed_coverage > 0? 
stats->summed_baseQ/(double) stats->summed_coverage : 0, - stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0 + stats[tid].beg+1, + stats[tid].end, + stats[tid].n_selected_reads, + stats[tid].n_covered_bases, + 100.0 * stats[tid].n_covered_bases / region_len, + stats[tid].summed_coverage / region_len, + stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0, + stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0 ); } -void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist, +void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist, const int hist_size, const bool full_utf) { int i, col; bool show_percentiles = false; const int n_rows = 10; const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2; const int blockchar_len = full_utf? 8 : 2; - /* - if (stats->beg == 0) { - stats->end = h->target_len[stats->tid]; - } - */ - double region_len = stats->end - stats->beg; + double region_len = stats[tid].end - stats[tid].beg; // Calculate histogram that contains percent covered double hist_data[hist_size]; double max_val = 0.0; for (i = 0; i < hist_size; ++i) { - hist_data[i] = 100 * hist[i] / (double) stats->bin_width; + hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width; if (hist_data[i] > max_val) max_val = hist_data[i]; } char buf[30]; - fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf)); + fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf)); double row_bin_size = max_val / (double) n_rows; for (i = n_rows-1; i >= 0; --i) { @@ -253,7 +238,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co } else { fprintf(file_out, ">%7.2f%% ", current_bin); } - fprintf(file_out, VERTICAL_LINE); + 
fprintf(file_out, full_utf ? VERTICAL_LINE : "|"); for (col = 0; col < hist_size; ++col) { // get the difference in eights, or halfs when full UTF8 is not supported int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; @@ -266,22 +251,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); } } - fprintf(file_out, VERTICAL_LINE); + fprintf(file_out, full_utf ? VERTICAL_LINE : "|"); fputc(' ', file_out); switch (i) { - case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break; - case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break; - case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break; + case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break; + case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break; + case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break; case 6: fprintf(file_out, "Percent covered: %.4g%%", - 100.0 * stats->n_covered_bases / region_len); break; + 100.0 * stats[tid].n_covered_bases / region_len); break; case 5: fprintf(file_out, "Mean coverage: %.3gx", - stats->summed_coverage / region_len); break; + stats[tid].summed_coverage / region_len); break; case 4: fprintf(file_out, "Mean baseQ: %.3g", - stats->summed_baseQ/(double) stats->summed_coverage); break; + stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break; case 3: fprintf(file_out, "Mean mapQ: %.3g", - stats->summed_mapQ/(double) stats->n_selected_reads); break; + stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break; case 1: fprintf(file_out, "Histo bin width: %sbp", - readable_bps(stats->bin_width, buf)); 
break; + readable_bps(stats[tid].bin_width, buf)); break; case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; }; fputc('\n', file_out); @@ -290,22 +275,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co // print x axis. Could be made pretty for widths that are not divisible // by 10 by variable spacing of the labels, instead of placing a label every 10 characters char buf2[50]; - fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10)); + fprintf(file_out, " %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10)); int rest; for (rest = 10; rest < 10*(hist_size/10); rest += 10) { - fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10)); + fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10)); } int last_padding = hist_size%10; - fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10)); + fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10)); fprintf(file_out, "\n"); } int main_coverage(int argc, char *argv[]) { int status = EXIT_SUCCESS; - int ret, tid, pos, i, j; + int ret, tid = -1, old_tid = -1, pos, i, j; - int max_depth = 0; + int max_depth = 1000000; int opt_min_baseQ = 0; int opt_min_mapQ = 0; int opt_min_len = 0; @@ -330,7 +315,6 @@ int main_coverage(int argc, char *argv[]) { bool opt_print_header = true; bool opt_print_tabular = true; bool opt_print_histogram = false; - bool *covered_tids = NULL; bool opt_full_utf = true; FILE *file_out = stdout; @@ -343,7 +327,7 @@ int main_coverage(int argc, char *argv[]) { {"incl-flags", required_argument, NULL, 1}, // require flag {"excl-flags", required_argument, NULL, 2}, // filter flag {"bam-list", required_argument, NULL, 'b'}, - {"min-read-len", required_argument, NULL, 'L'}, + {"min-read-len", required_argument, NULL, 'l'}, {"min-MQ", 
required_argument, NULL, 'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, @@ -355,13 +339,14 @@ int main_coverage(int argc, char *argv[]) { {"n-bins", required_argument, NULL, 'w'}, {"region", required_argument, NULL, 'r'}, {"help", no_argument, NULL, 'h'}, + {"depth", required_argument, NULL, 'd'}, { NULL, 0, NULL, 0 } }; // parse the command line int c; opterr = 0; - while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) { switch (c) { case 1: if ((required_flags = bam_str2flag(optarg)) < 0) { @@ -372,9 +357,10 @@ int main_coverage(int argc, char *argv[]) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; }; break; case 'o': opt_output_file = optarg; opt_full_width = false; break; - case 'L': opt_min_len = atoi(optarg); break; - case 'q': opt_min_baseQ = atoi(optarg); break; - case 'Q': opt_min_mapQ = atoi(optarg); break; + case 'l': opt_min_len = atoi(optarg); break; + case 'q': opt_min_mapQ = atoi(optarg); break; + case 'Q': opt_min_baseQ = atoi(optarg); break; + case 'd': max_depth = atoi(optarg); break; // maximum coverage depth case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; opt_print_histogram = true; opt_print_tabular = false; break; @@ -427,7 +413,7 @@ int main_coverage(int argc, char *argv[]) { if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) { columns = csbi.srWindow.Right - csbi.srWindow.Left + 1; } -#else +#elif defined TIOCGWINSZ struct winsize w; if (ioctl(2, TIOCGWINSZ, &w) == 0) columns = w.ws_col; @@ -460,7 +446,7 @@ int main_coverage(int argc, char *argv[]) { data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file if (!data) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ -469,7 
+455,7 @@ int main_coverage(int argc, char *argv[]) { int rf; data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); if (!data[i]) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ -485,12 +471,12 @@ int main_coverage(int argc, char *argv[]) { // Set CRAM options on file handle - returns 0 on success if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); + print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); status = EXIT_FAILURE; goto coverage_end; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); + print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); status = EXIT_FAILURE; goto coverage_end; } @@ -516,7 +502,7 @@ int main_coverage(int argc, char *argv[]) { data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { - print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); + print_error("coverage", "Failed to parse region \"%s\". 
Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]); status = EXIT_FAILURE; goto coverage_end; } @@ -528,30 +514,30 @@ int main_coverage(int argc, char *argv[]) { h = data[0]->hdr; // easy access to the header of the 1st BAM int n_targets = sam_hdr_nref(h); - covered_tids = calloc(n_targets, sizeof(bool)); - stats = calloc(1, sizeof(stats_aux_t)); - if (!covered_tids || !stats) { - print_error("coverage", "Failed to allocate memory"); + stats = calloc(n_targets, sizeof(stats_aux_t)); + if (!stats) { + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } int64_t n_bins = opt_n_bins; if (opt_reg) { - stats->tid = data[0]->iter->tid; - stats->beg = data[0]->iter->beg; // and to the parsed region coordinates - stats->end = data[0]->iter->end; - if (stats->end == HTS_POS_MAX) { - stats->end = sam_hdr_tid2len(h, stats->tid); + stats_aux_t *s = stats + data[0]->iter->tid; + s->beg = data[0]->iter->beg; // and to the parsed region coordinates + s->end = data[0]->iter->end; + if (s->end == HTS_POS_MAX) { + s->end = sam_hdr_tid2len(h, data[0]->iter->tid); } - if (opt_n_bins > stats->end - stats->beg) { - n_bins = stats->end - stats->beg; + if (opt_n_bins > s->end - s->beg) { + n_bins = s->end - s->beg; } - stats->bin_width = (stats->end-stats->beg) / n_bins; - } else { - stats->tid = -1; + s->bin_width = (s->end-s->beg) / (n_bins > 0 ? 
n_bins : 1); } + for (i=0; istats = stats; + int64_t current_bin = 0; // the core multi-pileup loop @@ -567,43 +553,41 @@ int main_coverage(int argc, char *argv[]) { n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) if (!hist || !n_plp || !plp) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (tid != stats->tid) { // Next target sequence - if (stats->tid >= 0) { // It's not the first sequence, print results - set_read_counts(data, stats, n_bam_files); + if (tid != old_tid) { // Next target sequence + if (old_tid >= 0) { if (opt_print_histogram) { - print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); + print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf); fputc('\n', file_out); } else if (opt_print_tabular) { - print_tabular_line(file_out, h, stats); + print_tabular_line(file_out, h, stats, old_tid); } - // reset data - memset(stats, 0, sizeof(stats_aux_t)); if (opt_print_histogram) memset(hist, 0, n_bins*sizeof(uint32_t)); } - stats->tid = tid; - covered_tids[tid] = true; + stats[tid].covered = true; if (!opt_reg) - stats->end = sam_hdr_tid2len(h, tid); + stats[tid].end = sam_hdr_tid2len(h, tid); if (opt_print_histogram) { - n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins; - stats->bin_width = (stats->end-stats->beg) / n_bins; + n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? 
stats[tid].end-stats[tid].beg : opt_n_bins; + stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins; } + + old_tid = tid; } - if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip + if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip if (tid >= n_targets) continue; // diff number of @SQ lines per file? if (opt_print_histogram) { - current_bin = (pos - stats->beg) / stats->bin_width; + current_bin = (pos - stats[tid].beg) / stats[tid].bin_width; } bool count_base = false; @@ -616,39 +600,40 @@ int main_coverage(int argc, char *argv[]) { else if (p->qpos < p->b->core.l_qseq && bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality else - stats->summed_baseQ += bam_get_qual(p->b)[p->qpos]; + stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos]; } if (depth_at_pos > 0) { count_base = true; - stats->summed_coverage += depth_at_pos; + stats[tid].summed_coverage += depth_at_pos; } // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output } if (count_base) { - ++(stats->n_covered_bases); + stats[tid].n_covered_bases++; if (opt_print_histogram && current_bin < n_bins) ++(hist[current_bin]); // Histogram based on breadth of coverage } } - if (stats->tid != -1) { - set_read_counts(data, stats, n_bam_files); + if (tid == -1 && opt_reg && *opt_reg != '*') + // Region specified but no data covering it. 
+ tid = data[0]->iter->tid; + + if (tid < n_targets && tid >=0) { if (opt_print_histogram) { - print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); + print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf); } else if (opt_print_tabular) { - print_tabular_line(file_out, h, stats); + print_tabular_line(file_out, h, stats, tid); } } if (!opt_reg && opt_print_tabular) { - memset(stats, 0, sizeof(stats_aux_t)); for (i = 0; i < n_targets; ++i) { - if (!covered_tids[i]) { - stats->tid = i; - stats->end = sam_hdr_tid2len(h, i); - print_tabular_line(file_out, h, stats); + if (!stats[i].covered) { + stats[i].end = sam_hdr_tid2len(h, i); + print_tabular_line(file_out, h, stats, i); } } } @@ -658,13 +643,11 @@ int main_coverage(int argc, char *argv[]) { coverage_end: if (n_plp) free(n_plp); if (plp) free(plp); - bam_mplp_destroy(mplp); + if (mplp) bam_mplp_destroy(mplp); - if (covered_tids) free(covered_tids); if (hist) free(hist); if (stats) free(stats); - // Close files and free data structures if (!(file_out == stdout || fclose(file_out) == 0)) { if (status == EXIT_SUCCESS) { diff --git a/samtools/coverage.c.pysam.c b/samtools/coverage.c.pysam.c index 127a528..662deb5 100644 --- a/samtools/coverage.c.pysam.c +++ b/samtools/coverage.c.pysam.c @@ -3,7 +3,7 @@ /* coverage.c -- samtools coverage subcommand Copyright (C) 2018,2019 Florian Breitwieser - Portions copyright (C) 2019 Genome Research Ltd. + Portions copyright (C) 2019-2021 Genome Research Ltd. Author: Florian P Breitwieser @@ -26,7 +26,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* This program calculates coverage from multiple BAMs - * simutaneously, to achieve random access and to use the BED interface. + * simultaneously, to achieve random access and to use the BED interface. * To compile this program separately, you may: * * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz @@ -59,19 +59,6 @@ DEALINGS IN THE SOFTWARE. 
*/ const char *VERSION = "0.1"; -typedef struct { // auxiliary data structure to hold a BAM file - samFile *fp; // file handle - sam_hdr_t *hdr; // file header - hts_itr_t *iter; // iterator to a region - NULL for us by default - int min_mapQ; // mapQ filter - int min_len; // length filter - unsigned int n_reads; // records the number of reads seen in file - unsigned int n_selected_reads; // records the number of reads passing filter - unsigned long summed_mapQ; // summed mapQ of all reads passing filter - int fail_flags; - int required_flags; -} bam_aux_t; - typedef struct { // auxiliary data structure to hold stats on coverage unsigned long long n_covered_bases; unsigned long long summed_coverage; @@ -79,12 +66,23 @@ typedef struct { // auxiliary data structure to hold stats on coverage unsigned long long summed_mapQ; unsigned int n_reads; unsigned int n_selected_reads; - int32_t tid; // chromosome ID, defined by header + bool covered; hts_pos_t beg; hts_pos_t end; int64_t bin_width; } stats_aux_t; +typedef struct { // auxiliary data structure to hold a BAM file + samFile *fp; // file handle + sam_hdr_t *hdr; // file header + hts_itr_t *iter; // iterator to a region - NULL for us by default + int min_mapQ; // mapQ filter + int min_len; // length filter + int fail_flags; + int required_flags; + stats_aux_t *stats; +} bam_aux_t; + #if __STDC_VERSION__ >= 199901L #define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL @@ -93,7 +91,7 @@ typedef struct { // auxiliary data structure to hold stats on coverage // LOWER ONE EIGHTH BLOCK … FULL BLOCK static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). 
Use only half and full block for those -static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; +static const char *const BLOCK_CHARS2[2] = {".", ":"}; #else @@ -104,7 +102,7 @@ static const char *const BLOCK_CHARS8[8] = { "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; -static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; +static const char *const BLOCK_CHARS2[2] = {".", ":"}; #endif @@ -116,11 +114,14 @@ static int usage() { "Input options:\n" " -b, --bam-list FILE list of input BAM filenames, one per line\n" " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n" - " -q, --min-MQ INT base quality threshold [0]\n" - " -Q, --min-BQ INT mapping quality threshold [0]\n" + " -q, --min-MQ INT mapping quality threshold [0]\n" + " -Q, --min-BQ INT base quality threshold [0]\n" " --rf required flags: skip reads with mask bits unset []\n" " --ff filter flags: skip reads with mask bits set \n" " [UNMAP,SECONDARY,QCFAIL,DUP]\n" + " -d, --depth INT maximum allowed coverage depth [1000000].\n" + " If 0, depth is set to the maximum integer value,\n" + " effectively removing any depth limit.\n" "Output options:\n" " -m, --histogram show histogram instead of tabular output\n" " -A, --ascii show only ASCII characters in histogram\n" @@ -173,79 +174,63 @@ static char* readable_bps(double base_pairs, char *buf) { return buf; } -static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) { - int i; - stats->n_reads = 0; - stats->n_selected_reads = 0; - stats->summed_mapQ = 0; - for (i = 0; i < n_bam_files && data[i]; ++i) { - stats->n_reads += data[i]->n_reads; - stats->n_selected_reads += data[i]->n_selected_reads; - stats->summed_mapQ += data[i]->summed_mapQ; - data[i]->n_reads = 0; - data[i]->n_selected_reads = 0; - data[i]->summed_mapQ = 0; - } -} - // read one alignment from one BAM file static int read_bam(void *data, bam1_t *b) { 
bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure + int nref = sam_hdr_nref(aux->hdr); int ret; while (1) { if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break; - ++aux->n_reads; + if (b->core.tid >= 0 && b->core.tid < nref) + aux->stats[b->core.tid].n_reads++; if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue; if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue; if ( b->core.qual < aux->min_mapQ ) continue; if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - ++aux->n_selected_reads; - aux->summed_mapQ += b->core.qual; + if (b->core.tid >= 0 && b->core.tid < nref) { + aux->stats[b->core.tid].n_selected_reads++; + aux->stats[b->core.tid].summed_mapQ += b->core.qual; + } break; } return ret; } -void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) { - fputs(sam_hdr_tid2name(h, stats->tid), file_out); - double region_len = (double) stats->end - stats->beg; +void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) { + fputs(sam_hdr_tid2name(h, tid), file_out); + double region_len = (double) stats[tid].end - stats[tid].beg; fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n", - stats->beg+1, - stats->end, - stats->n_selected_reads, - stats->n_covered_bases, - 100.0 * stats->n_covered_bases / region_len, - stats->summed_coverage / region_len, - stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0, - stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0 + stats[tid].beg+1, + stats[tid].end, + stats[tid].n_selected_reads, + stats[tid].n_covered_bases, + 100.0 * stats[tid].n_covered_bases / region_len, + stats[tid].summed_coverage / region_len, + stats[tid].summed_coverage > 0? 
stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0, + stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0 ); } -void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist, +void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist, const int hist_size, const bool full_utf) { int i, col; bool show_percentiles = false; const int n_rows = 10; const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2; const int blockchar_len = full_utf? 8 : 2; - /* - if (stats->beg == 0) { - stats->end = h->target_len[stats->tid]; - } - */ - double region_len = stats->end - stats->beg; + double region_len = stats[tid].end - stats[tid].beg; // Calculate histogram that contains percent covered double hist_data[hist_size]; double max_val = 0.0; for (i = 0; i < hist_size; ++i) { - hist_data[i] = 100 * hist[i] / (double) stats->bin_width; + hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width; if (hist_data[i] > max_val) max_val = hist_data[i]; } char buf[30]; - fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf)); + fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf)); double row_bin_size = max_val / (double) n_rows; for (i = n_rows-1; i >= 0; --i) { @@ -255,7 +240,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co } else { fprintf(file_out, ">%7.2f%% ", current_bin); } - fprintf(file_out, VERTICAL_LINE); + fprintf(file_out, full_utf ? 
VERTICAL_LINE : "|"); for (col = 0; col < hist_size; ++col) { // get the difference in eights, or halfs when full UTF8 is not supported int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; @@ -268,22 +253,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); } } - fprintf(file_out, VERTICAL_LINE); + fprintf(file_out, full_utf ? VERTICAL_LINE : "|"); fputc(' ', file_out); switch (i) { - case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break; - case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break; - case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break; + case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break; + case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break; + case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break; case 6: fprintf(file_out, "Percent covered: %.4g%%", - 100.0 * stats->n_covered_bases / region_len); break; + 100.0 * stats[tid].n_covered_bases / region_len); break; case 5: fprintf(file_out, "Mean coverage: %.3gx", - stats->summed_coverage / region_len); break; + stats[tid].summed_coverage / region_len); break; case 4: fprintf(file_out, "Mean baseQ: %.3g", - stats->summed_baseQ/(double) stats->summed_coverage); break; + stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break; case 3: fprintf(file_out, "Mean mapQ: %.3g", - stats->summed_mapQ/(double) stats->n_selected_reads); break; + stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break; case 1: fprintf(file_out, "Histo bin width: %sbp", - readable_bps(stats->bin_width, buf)); break; + 
readable_bps(stats[tid].bin_width, buf)); break; case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; }; fputc('\n', file_out); @@ -292,22 +277,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co // print x axis. Could be made pretty for widths that are not divisible // by 10 by variable spacing of the labels, instead of placing a label every 10 characters char buf2[50]; - fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10)); + fprintf(file_out, " %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10)); int rest; for (rest = 10; rest < 10*(hist_size/10); rest += 10) { - fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10)); + fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10)); } int last_padding = hist_size%10; - fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10)); + fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10)); fprintf(file_out, "\n"); } int main_coverage(int argc, char *argv[]) { int status = EXIT_SUCCESS; - int ret, tid, pos, i, j; + int ret, tid = -1, old_tid = -1, pos, i, j; - int max_depth = 0; + int max_depth = 1000000; int opt_min_baseQ = 0; int opt_min_mapQ = 0; int opt_min_len = 0; @@ -332,7 +317,6 @@ int main_coverage(int argc, char *argv[]) { bool opt_print_header = true; bool opt_print_tabular = true; bool opt_print_histogram = false; - bool *covered_tids = NULL; bool opt_full_utf = true; FILE *file_out = samtools_stdout; @@ -345,7 +329,7 @@ int main_coverage(int argc, char *argv[]) { {"incl-flags", required_argument, NULL, 1}, // require flag {"excl-flags", required_argument, NULL, 2}, // filter flag {"bam-list", required_argument, NULL, 'b'}, - {"min-read-len", required_argument, NULL, 'L'}, + {"min-read-len", required_argument, NULL, 'l'}, {"min-MQ", 
required_argument, NULL, 'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, @@ -357,13 +341,14 @@ int main_coverage(int argc, char *argv[]) { {"n-bins", required_argument, NULL, 'w'}, {"region", required_argument, NULL, 'r'}, {"help", no_argument, NULL, 'h'}, + {"depth", required_argument, NULL, 'd'}, { NULL, 0, NULL, 0 } }; // parse the command line int c; opterr = 0; - while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) { switch (c) { case 1: if ((required_flags = bam_str2flag(optarg)) < 0) { @@ -374,9 +359,10 @@ int main_coverage(int argc, char *argv[]) { fprintf(samtools_stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; }; break; case 'o': opt_output_file = optarg; opt_full_width = false; break; - case 'L': opt_min_len = atoi(optarg); break; - case 'q': opt_min_baseQ = atoi(optarg); break; - case 'Q': opt_min_mapQ = atoi(optarg); break; + case 'l': opt_min_len = atoi(optarg); break; + case 'q': opt_min_mapQ = atoi(optarg); break; + case 'Q': opt_min_baseQ = atoi(optarg); break; + case 'd': max_depth = atoi(optarg); break; // maximum coverage depth case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; opt_print_histogram = true; opt_print_tabular = false; break; @@ -429,7 +415,7 @@ int main_coverage(int argc, char *argv[]) { if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) { columns = csbi.srWindow.Right - csbi.srWindow.Left + 1; } -#else +#elif defined TIOCGWINSZ struct winsize w; if (ioctl(2, TIOCGWINSZ, &w) == 0) columns = w.ws_col; @@ -462,7 +448,7 @@ int main_coverage(int argc, char *argv[]) { data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file if (!data) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ 
-471,7 +457,7 @@ int main_coverage(int argc, char *argv[]) { int rf; data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); if (!data[i]) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ -487,12 +473,12 @@ int main_coverage(int argc, char *argv[]) { // Set CRAM options on file handle - returns 0 on success if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); + print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); status = EXIT_FAILURE; goto coverage_end; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); + print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); status = EXIT_FAILURE; goto coverage_end; } @@ -518,7 +504,7 @@ int main_coverage(int argc, char *argv[]) { data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { - print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); + print_error("coverage", "Failed to parse region \"%s\". 
Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]); status = EXIT_FAILURE; goto coverage_end; } @@ -530,30 +516,30 @@ int main_coverage(int argc, char *argv[]) { h = data[0]->hdr; // easy access to the header of the 1st BAM int n_targets = sam_hdr_nref(h); - covered_tids = calloc(n_targets, sizeof(bool)); - stats = calloc(1, sizeof(stats_aux_t)); - if (!covered_tids || !stats) { - print_error("coverage", "Failed to allocate memory"); + stats = calloc(n_targets, sizeof(stats_aux_t)); + if (!stats) { + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } int64_t n_bins = opt_n_bins; if (opt_reg) { - stats->tid = data[0]->iter->tid; - stats->beg = data[0]->iter->beg; // and to the parsed region coordinates - stats->end = data[0]->iter->end; - if (stats->end == HTS_POS_MAX) { - stats->end = sam_hdr_tid2len(h, stats->tid); + stats_aux_t *s = stats + data[0]->iter->tid; + s->beg = data[0]->iter->beg; // and to the parsed region coordinates + s->end = data[0]->iter->end; + if (s->end == HTS_POS_MAX) { + s->end = sam_hdr_tid2len(h, data[0]->iter->tid); } - if (opt_n_bins > stats->end - stats->beg) { - n_bins = stats->end - stats->beg; + if (opt_n_bins > s->end - s->beg) { + n_bins = s->end - s->beg; } - stats->bin_width = (stats->end-stats->beg) / n_bins; - } else { - stats->tid = -1; + s->bin_width = (s->end-s->beg) / (n_bins > 0 ? 
n_bins : 1); } + for (i=0; istats = stats; + int64_t current_bin = 0; // the core multi-pileup loop @@ -569,43 +555,41 @@ int main_coverage(int argc, char *argv[]) { n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) if (!hist || !n_plp || !plp) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (tid != stats->tid) { // Next target sequence - if (stats->tid >= 0) { // It's not the first sequence, print results - set_read_counts(data, stats, n_bam_files); + if (tid != old_tid) { // Next target sequence + if (old_tid >= 0) { if (opt_print_histogram) { - print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); + print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf); fputc('\n', file_out); } else if (opt_print_tabular) { - print_tabular_line(file_out, h, stats); + print_tabular_line(file_out, h, stats, old_tid); } - // reset data - memset(stats, 0, sizeof(stats_aux_t)); if (opt_print_histogram) memset(hist, 0, n_bins*sizeof(uint32_t)); } - stats->tid = tid; - covered_tids[tid] = true; + stats[tid].covered = true; if (!opt_reg) - stats->end = sam_hdr_tid2len(h, tid); + stats[tid].end = sam_hdr_tid2len(h, tid); if (opt_print_histogram) { - n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins; - stats->bin_width = (stats->end-stats->beg) / n_bins; + n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? 
stats[tid].end-stats[tid].beg : opt_n_bins; + stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins; } + + old_tid = tid; } - if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip + if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip if (tid >= n_targets) continue; // diff number of @SQ lines per file? if (opt_print_histogram) { - current_bin = (pos - stats->beg) / stats->bin_width; + current_bin = (pos - stats[tid].beg) / stats[tid].bin_width; } bool count_base = false; @@ -618,39 +602,40 @@ int main_coverage(int argc, char *argv[]) { else if (p->qpos < p->b->core.l_qseq && bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality else - stats->summed_baseQ += bam_get_qual(p->b)[p->qpos]; + stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos]; } if (depth_at_pos > 0) { count_base = true; - stats->summed_coverage += depth_at_pos; + stats[tid].summed_coverage += depth_at_pos; } // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output } if (count_base) { - ++(stats->n_covered_bases); + stats[tid].n_covered_bases++; if (opt_print_histogram && current_bin < n_bins) ++(hist[current_bin]); // Histogram based on breadth of coverage } } - if (stats->tid != -1) { - set_read_counts(data, stats, n_bam_files); + if (tid == -1 && opt_reg && *opt_reg != '*') + // Region specified but no data covering it. 
+ tid = data[0]->iter->tid; + + if (tid < n_targets && tid >=0) { if (opt_print_histogram) { - print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); + print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf); } else if (opt_print_tabular) { - print_tabular_line(file_out, h, stats); + print_tabular_line(file_out, h, stats, tid); } } if (!opt_reg && opt_print_tabular) { - memset(stats, 0, sizeof(stats_aux_t)); for (i = 0; i < n_targets; ++i) { - if (!covered_tids[i]) { - stats->tid = i; - stats->end = sam_hdr_tid2len(h, i); - print_tabular_line(file_out, h, stats); + if (!stats[i].covered) { + stats[i].end = sam_hdr_tid2len(h, i); + print_tabular_line(file_out, h, stats, i); } } } @@ -660,13 +645,11 @@ int main_coverage(int argc, char *argv[]) { coverage_end: if (n_plp) free(n_plp); if (plp) free(plp); - bam_mplp_destroy(mplp); + if (mplp) bam_mplp_destroy(mplp); - if (covered_tids) free(covered_tids); if (hist) free(hist); if (stats) free(stats); - // Close files and free data structures if (!(file_out == samtools_stdout || fclose(file_out) == 0)) { if (status == EXIT_SUCCESS) { diff --git a/samtools/cut_target.c b/samtools/cut_target.c index e59f51b..7c8387c 100644 --- a/samtools/cut_target.c +++ b/samtools/cut_target.c @@ -63,7 +63,7 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) if (n > g->max_bases) { // enlarge g->bases g->max_bases = n; kroundup32(g->max_bases); - g->bases = realloc(g->bases, g->max_bases * 2); + g->bases = realloc(g->bases, (size_t) g->max_bases * 2); } for (i = k = 0; i < n; ++i) { const bam_pileup1_t *p = plp + i; @@ -170,7 +170,7 @@ static int read_aln(void *data, bam1_t *b) int main_cut_target(int argc, char *argv[]) { - int c, tid, pos, n, lasttid = -1, usage = 0; + int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS; hts_pos_t l, max_l; const bam_pileup1_t *p; bam_plp_t plp; @@ -237,6 +237,12 @@ int main_cut_target(int argc, char *argv[]) cns[pos] = gencns(&g, n, p); } process_cns(g.h, 
lasttid, l, cns); + + if (n < 0) { + print_error("targetcut", "error reading from \"%s\"", argv[optind]); + status = EXIT_FAILURE; + } + free(cns); sam_hdr_destroy(g.h); bam_plp_destroy(plp); @@ -247,5 +253,5 @@ int main_cut_target(int argc, char *argv[]) errmod_destroy(g.em); free(g.bases); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c index bbc2d29..babe42b 100644 --- a/samtools/cut_target.c.pysam.c +++ b/samtools/cut_target.c.pysam.c @@ -65,7 +65,7 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) if (n > g->max_bases) { // enlarge g->bases g->max_bases = n; kroundup32(g->max_bases); - g->bases = realloc(g->bases, g->max_bases * 2); + g->bases = realloc(g->bases, (size_t) g->max_bases * 2); } for (i = k = 0; i < n; ++i) { const bam_pileup1_t *p = plp + i; @@ -172,7 +172,7 @@ static int read_aln(void *data, bam1_t *b) int main_cut_target(int argc, char *argv[]) { - int c, tid, pos, n, lasttid = -1, usage = 0; + int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS; hts_pos_t l, max_l; const bam_pileup1_t *p; bam_plp_t plp; @@ -239,6 +239,12 @@ int main_cut_target(int argc, char *argv[]) cns[pos] = gencns(&g, n, p); } process_cns(g.h, lasttid, l, cns); + + if (n < 0) { + print_error("targetcut", "error reading from \"%s\"", argv[optind]); + status = EXIT_FAILURE; + } + free(cns); sam_hdr_destroy(g.h); bam_plp_destroy(plp); @@ -249,5 +255,5 @@ int main_cut_target(int argc, char *argv[]) errmod_destroy(g.em); free(g.bases); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/dict.c b/samtools/dict.c index c159c24..029d548 100644 --- a/samtools/dict.c +++ b/samtools/dict.c @@ -1,6 +1,6 @@ /* dict.c -- create a sequence dictionary file. - Copyright (C) 2015 Genome Research Ltd. + Copyright (C) 2015, 2020 Genome Research Ltd. Author: Shane McCarthy @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include +#include #include #include #include @@ -37,7 +38,7 @@ typedef struct _args_t { char *output_fname, *fname; char *assembly, *species, *uri; - int header; + int alias, header; } args_t; @@ -79,6 +80,20 @@ static void write_dict(const char *fn, args_t *args) hts_md5_final(digest, md5); hts_md5_hex(hex, digest); fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex); + if (args->alias) { + const char *name = seq->name.s; + if (strncmp(name, "chr", 3) == 0) { + name += 3; + fprintf(out, "\tAN:%s", name); + } + else + fprintf(out, "\tAN:chr%s", name); + + if (strcmp(name, "M") == 0) + fprintf(out, ",chrMT,MT"); + else if (strcmp(name, "MT") == 0) + fprintf(out, ",chrM,M"); + } if (args->uri) fprintf(out, "\tUR:%s", args->uri); else if (strcmp(fn, "-") != 0) { @@ -107,8 +122,10 @@ static int dict_usage(void) fprintf(stderr, "About: Create a sequence dictionary file from a fasta file\n"); fprintf(stderr, "Usage: samtools dict [options] \n\n"); fprintf(stderr, "Options: -a, --assembly STR assembly\n"); + fprintf(stderr, " -A, --alias, --alternative-name\n"); + fprintf(stderr, " add AN tag by adding/removing 'chr'\n"); fprintf(stderr, " -H, --no-header do not print @HD line\n"); - fprintf(stderr, " -o, --output STR file to write out dict file [stdout]\n"); + fprintf(stderr, " -o, --output FILE file to write out dict file [stdout]\n"); fprintf(stderr, " -s, --species STR species\n"); fprintf(stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); fprintf(stderr, "\n"); @@ -124,6 +141,8 @@ int dict_main(int argc, char *argv[]) { {"help", no_argument, NULL, 'h'}, {"no-header", no_argument, NULL, 'H'}, + {"alias", no_argument, NULL, 'A'}, + {"alternative-name", no_argument, NULL, 'A'}, {"assembly", required_argument, NULL, 'a'}, {"species", required_argument, NULL, 's'}, {"uri", required_argument, NULL, 'u'}, @@ -131,10 +150,11 @@ int dict_main(int argc, char *argv[]) {NULL, 0, NULL, 0} }; int c; - while ( 
(c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 ) + while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 ) { switch (c) { + case 'A': args->alias = 1; break; case 'a': args->assembly = optarg; break; case 's': args->species = optarg; break; case 'u': args->uri = optarg; break; diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c index 87ec1ac..ca54c48 100644 --- a/samtools/dict.c.pysam.c +++ b/samtools/dict.c.pysam.c @@ -2,7 +2,7 @@ /* dict.c -- create a sequence dictionary file. - Copyright (C) 2015 Genome Research Ltd. + Copyright (C) 2015, 2020 Genome Research Ltd. Author: Shane McCarthy @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ typedef struct _args_t { char *output_fname, *fname; char *assembly, *species, *uri; - int header; + int alias, header; } args_t; @@ -55,19 +56,19 @@ static void write_dict(const char *fn, args_t *args) fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn); - exit(1); + samtools_exit(1); } FILE *out = samtools_stdout; if (args->output_fname) { out = fopen(args->output_fname, "w"); if (out == NULL) { fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname); - exit(1); + samtools_exit(1); } } if (!(md5 = hts_md5_init())) - exit(1); + samtools_exit(1); seq = kseq_init(fp); if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n"); @@ -81,6 +82,20 @@ static void write_dict(const char *fn, args_t *args) hts_md5_final(digest, md5); hts_md5_hex(hex, digest); fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex); + if (args->alias) { + const char *name = seq->name.s; + if (strncmp(name, "chr", 3) == 0) { + name += 3; + fprintf(out, "\tAN:%s", name); + } + else + fprintf(out, "\tAN:chr%s", name); + + if (strcmp(name, "M") == 0) + fprintf(out, ",chrMT,MT"); + else if (strcmp(name, "MT") == 0) + 
fprintf(out, ",chrM,M"); + } if (args->uri) fprintf(out, "\tUR:%s", args->uri); else if (strcmp(fn, "-") != 0) { @@ -109,8 +124,10 @@ static int dict_usage(void) fprintf(samtools_stderr, "About: Create a sequence dictionary file from a fasta file\n"); fprintf(samtools_stderr, "Usage: samtools dict [options] \n\n"); fprintf(samtools_stderr, "Options: -a, --assembly STR assembly\n"); + fprintf(samtools_stderr, " -A, --alias, --alternative-name\n"); + fprintf(samtools_stderr, " add AN tag by adding/removing 'chr'\n"); fprintf(samtools_stderr, " -H, --no-header do not print @HD line\n"); - fprintf(samtools_stderr, " -o, --output STR file to write out dict file [samtools_stdout]\n"); + fprintf(samtools_stderr, " -o, --output FILE file to write out dict file [samtools_stdout]\n"); fprintf(samtools_stderr, " -s, --species STR species\n"); fprintf(samtools_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); fprintf(samtools_stderr, "\n"); @@ -126,6 +143,8 @@ int dict_main(int argc, char *argv[]) { {"help", no_argument, NULL, 'h'}, {"no-header", no_argument, NULL, 'H'}, + {"alias", no_argument, NULL, 'A'}, + {"alternative-name", no_argument, NULL, 'A'}, {"assembly", required_argument, NULL, 'a'}, {"species", required_argument, NULL, 's'}, {"uri", required_argument, NULL, 'u'}, @@ -133,10 +152,11 @@ int dict_main(int argc, char *argv[]) {NULL, 0, NULL, 0} }; int c; - while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 ) + while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 ) { switch (c) { + case 'A': args->alias = 1; break; case 'a': args->assembly = optarg; break; case 's': args->species = optarg; break; case 'u': args->uri = optarg; break; diff --git a/samtools/faidx.c b/samtools/faidx.c index 162233f..03b5d65 100644 --- a/samtools/faidx.c +++ b/samtools/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- faidx subcommand. - Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd. 
+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -198,14 +198,16 @@ static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, con static int usage(FILE *fp, enum fai_format_options format, int exit_status) { - char *tool, *file_type; + char *tool, *file_type, *index_name; if (format == FAI_FASTA) { tool = "faidx "; file_type = "FASTA"; + index_name = "file.fa"; } else { tool = "fqidx "; file_type = "FASTQ"; + index_name = "file.fq"; } fprintf(fp, "Usage: samtools %s [ [...]]\n", tool); @@ -219,8 +221,10 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status) " TYPE = rc for /rc on negative strand (default)\n" " no for no strand indicator\n" " sign for (+) / (-)\n" - " custom,, for custom indicator\n", - file_type, file_type); + " custom,, for custom indicator\n" + " --fai-idx FILE name of the index file (default %s.fai).\n" + " --gzi-idx FILE name of compressed file index (default %s.gz.gzi).\n", + file_type, file_type, index_name, index_name); if (format == FAI_FASTA) { @@ -241,6 +245,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) char *pos_strand_name = ""; // Extension to add to name for +ve strand char *neg_strand_name = "/rc"; // Extension to add to name for -ve strand char *strand_names = NULL; // Used for custom strand annotation + char *fai_name = NULL; // specified index name + char *gzi_name = NULL; // specified compressed index name FILE* file_out = stdout;/* output stream */ static const struct option lopts[] = { @@ -252,6 +258,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) { "fastq", no_argument, NULL, 'f' }, { "reverse-complement", no_argument, NULL, 'i' }, { "mark-strand", required_argument, NULL, 1000 }, + { "fai-idx", required_argument, NULL, 1001 }, + { "gzi-idx", required_argument, NULL, 1002 }, { NULL, 0, NULL, 0 } }; @@ -300,6 +308,8 @@ int faidx_core(int argc, 
char *argv[], enum fai_format_options format) return usage(stderr, format, EXIT_FAILURE); } break; + case 1001: fai_name = optarg; break; + case 1002: gzi_name = optarg; break; default: break; } } @@ -307,19 +317,40 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) if ( argc==optind ) return usage(stdout, format, EXIT_SUCCESS); - if ( optind+1 == argc && !region_file) - { - if (fai_build(argv[optind]) != 0) { - fprintf(stderr, "[faidx] Could not build fai index %s.fai\n", argv[optind]); + if (optind+1 == argc && !region_file) { + if (output_file && !fai_name) + fai_name = output_file; + + if (fai_build3(argv[optind], fai_name, gzi_name) != 0) { + if (fai_name) + fprintf(stderr, "[faidx] Could not build fai index %s", fai_name); + else + fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]); + + if (gzi_name) + fprintf(stderr, " or compressed index %s\n", gzi_name); + else + fprintf(stderr, "\n"); + return EXIT_FAILURE; } + return 0; } - faidx_t *fai = fai_load_format(argv[optind], format); + faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format); + + if (!fai) { + if (fai_name) + fprintf(stderr, "[faidx] Could not load fai index %s", fai_name); + else + fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]); + + if (gzi_name) + fprintf(stderr, " or compressed index %s\n", gzi_name); + else + fprintf(stderr, "\n"); - if ( !fai ) { - fprintf(stderr, "[faidx] Could not load fai index of %s\n", argv[optind]); return EXIT_FAILURE; } diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c index e73e63b..0bc515b 100644 --- a/samtools/faidx.c.pysam.c +++ b/samtools/faidx.c.pysam.c @@ -2,7 +2,7 @@ /* faidx.c -- faidx subcommand. - Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. 
Author: Heng Li @@ -200,14 +200,16 @@ static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, con static int usage(FILE *fp, enum fai_format_options format, int exit_status) { - char *tool, *file_type; + char *tool, *file_type, *index_name; if (format == FAI_FASTA) { tool = "faidx "; file_type = "FASTA"; + index_name = "file.fa"; } else { tool = "fqidx "; file_type = "FASTQ"; + index_name = "file.fq"; } fprintf(fp, "Usage: samtools %s [ [...]]\n", tool); @@ -221,8 +223,10 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status) " TYPE = rc for /rc on negative strand (default)\n" " no for no strand indicator\n" " sign for (+) / (-)\n" - " custom,, for custom indicator\n", - file_type, file_type); + " custom,, for custom indicator\n" + " --fai-idx FILE name of the index file (default %s.fai).\n" + " --gzi-idx FILE name of compressed file index (default %s.gz.gzi).\n", + file_type, file_type, index_name, index_name); if (format == FAI_FASTA) { @@ -243,6 +247,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) char *pos_strand_name = ""; // Extension to add to name for +ve strand char *neg_strand_name = "/rc"; // Extension to add to name for -ve strand char *strand_names = NULL; // Used for custom strand annotation + char *fai_name = NULL; // specified index name + char *gzi_name = NULL; // specified compressed index name FILE* file_out = samtools_stdout;/* output stream */ static const struct option lopts[] = { @@ -254,6 +260,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) { "fastq", no_argument, NULL, 'f' }, { "reverse-complement", no_argument, NULL, 'i' }, { "mark-strand", required_argument, NULL, 1000 }, + { "fai-idx", required_argument, NULL, 1001 }, + { "gzi-idx", required_argument, NULL, 1002 }, { NULL, 0, NULL, 0 } }; @@ -302,6 +310,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) return usage(samtools_stderr, format, EXIT_FAILURE); } break; 
+ case 1001: fai_name = optarg; break; + case 1002: gzi_name = optarg; break; default: break; } } @@ -309,19 +319,40 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) if ( argc==optind ) return usage(samtools_stdout, format, EXIT_SUCCESS); - if ( optind+1 == argc && !region_file) - { - if (fai_build(argv[optind]) != 0) { - fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai\n", argv[optind]); + if (optind+1 == argc && !region_file) { + if (output_file && !fai_name) + fai_name = output_file; + + if (fai_build3(argv[optind], fai_name, gzi_name) != 0) { + if (fai_name) + fprintf(samtools_stderr, "[faidx] Could not build fai index %s", fai_name); + else + fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]); + + if (gzi_name) + fprintf(samtools_stderr, " or compressed index %s\n", gzi_name); + else + fprintf(samtools_stderr, "\n"); + return EXIT_FAILURE; } + return 0; } - faidx_t *fai = fai_load_format(argv[optind], format); + faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format); + + if (!fai) { + if (fai_name) + fprintf(samtools_stderr, "[faidx] Could not load fai index %s", fai_name); + else + fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]); + + if (gzi_name) + fprintf(samtools_stderr, " or compressed index %s\n", gzi_name); + else + fprintf(samtools_stderr, "\n"); - if ( !fai ) { - fprintf(samtools_stderr, "[faidx] Could not load fai index of %s\n", argv[optind]); return EXIT_FAILURE; } diff --git a/samtools/htslib-1.10/LICENSE b/samtools/htslib-1.10/LICENSE deleted file mode 100644 index f70e757..0000000 --- a/samtools/htslib-1.10/LICENSE +++ /dev/null @@ -1,69 +0,0 @@ -[Files in this distribution outwith the cram/ subdirectory are distributed -according to the terms of the following MIT/Expat license.] - -The MIT/Expat License - -Copyright (C) 2012-2019 Genome Research Ltd. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. - - -[Files within the cram/ subdirectory in this distribution are distributed -according to the terms of the following Modified 3-Clause BSD license.] - -The Modified-BSD License - -Copyright (C) 2012-2019 Genome Research Ltd. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute - nor the names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -[The use of a range of years within a copyright notice in this distribution -should be interpreted as being equivalent to a list of years including the -first and last year specified and all consecutive years between them. - -For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, -2011-2012" should be interpreted as being identical to a notice that reads -"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice -that reads "Copyright (C) 2005-2012" should be interpreted as being identical -to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, -2011, 2012".] diff --git a/samtools/htslib-1.10/README b/samtools/htslib-1.10/README deleted file mode 100644 index 4225bec..0000000 --- a/samtools/htslib-1.10/README +++ /dev/null @@ -1,5 +0,0 @@ -HTSlib is an implementation of a unified C library for accessing common file -formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing -data. It is the core library used by samtools and bcftools. - -See INSTALL for building and installation instructions. 
diff --git a/samtools/padding.c b/samtools/padding.c index a769efe..11b098e 100644 --- a/samtools/padding.c +++ b/samtools/padding.c @@ -1,7 +1,7 @@ /* padding.c -- depad subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2014-2016, 2019 Genome Research Ltd. + Copyright (C) 2014-2016, 2019-2020 Genome Research Ltd. Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. Author: Heng Li @@ -38,24 +38,38 @@ DEALINGS IN THE SOFTWARE. */ #define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5) -// The one and only function needed from sam.c. -// Explicitly here to avoid including bam.h translation layer. -extern char *samfaipath(const char *fn_ref); - -static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) +static int replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar) { + int diff = 0; if (n != b->core.n_cigar) { int o = b->core.l_qname + b->core.n_cigar * 4; - if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) { - b->m_data = b->l_data + (n - b->core.n_cigar) * 4; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + if (n > b->core.n_cigar) { + diff = (n - b->core.n_cigar) * 4; + if ((INT_MAX - b->l_data)/4 < (n - b->core.n_cigar)) { + fprintf(stderr, "[depad] ERROR: BAM record too big\n"); + return -1; + } + if (b->l_data + diff > b->m_data) { + b->m_data = b->l_data + diff; + kroundup32(b->m_data); + uint8_t *tmp = (uint8_t*)realloc(b->data, b->m_data); + if (!tmp) { + fprintf(stderr, "[depad] ERROR: Memory allocation failure.\n"); + return -1; + } + b->data = tmp; + } + } else { + diff = -(int)((b->core.n_cigar - n) * 4); } memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o); - memcpy(b->data + b->core.l_qname, cigar, n * 4); - b->l_data += (n - b->core.n_cigar) * 4; b->core.n_cigar = n; - } else memcpy(b->data + b->core.l_qname, cigar, n * 4); + } + + memcpy(b->data + b->core.l_qname, cigar, n * 4); + b->l_data += diff; + + return 0; } #define write_cigar(_c, _n, _m, _v) do { 
\ @@ -195,7 +209,8 @@ int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) kstring_t r, q; int r_tid = -1; uint32_t *cigar2 = 0; - int ret = 0, n2 = 0, m2 = 0, *posmap = 0; + int ret = 0, *posmap = 0; + uint32_t n2 = 0, m2 = 0; b = bam_init1(); if (!b) { @@ -242,7 +257,8 @@ int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) } } write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); - replace_cigar(b, n2, cigar2); + if (replace_cigar(b, n2, cigar2) < 0) + return -1; posmap = update_posmap(posmap, r); } else if (b->core.n_cigar > 0) { int i, k, op; @@ -328,7 +344,8 @@ int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) for (i = k = 0; i < n2; ++i) if (cigar2[i]) cigar2[k++] = cigar2[i]; n2 = k; - replace_cigar(b, n2, cigar2); + if (replace_cigar(b, n2, cigar2) < 0) + return -1; } /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */ if (b->core.pos != -1) b->core.pos = posmap[b->core.pos]; @@ -430,7 +447,7 @@ int main_pad2unpad(int argc, char *argv[]) sam_hdr_t *h = 0, *h_fix = 0; faidx_t *fai = 0; int c, compress_level = -1, is_long_help = 0, no_pg = 0; - char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL; + char in_mode[5], out_mode[6], *fn_out = 0, *fn_fai = 0, *fn_out_idx = NULL; int ret=0; char *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -477,8 +494,8 @@ int main_pad2unpad(int argc, char *argv[]) // Load FASTA reference (also needed for SAM -> BAM if missing header) if (ga.reference) { - fn_list = samfaipath(ga.reference); - fai = fai_load(ga.reference); + fn_fai = fai_path(ga.reference); + fai = fai_load3(ga.reference, fn_fai, NULL, FAI_CREATE); } // open file handlers if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) { @@ -486,8 +503,8 @@ int main_pad2unpad(int argc, char *argv[]) ret = 1; goto depad_end; } - if (fn_list && hts_set_fai_filename(in, fn_list) != 0) { - fprintf(stderr, "[depad] failed 
to load reference file \"%s\".\n", fn_list); + if (fn_fai && hts_set_fai_filename(in, fn_fai) != 0) { + fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_fai); ret = 1; goto depad_end; } @@ -570,7 +587,7 @@ depad_end: fprintf(stderr, "[depad] error on closing output file.\n"); ret = 1; } - free(fn_list); free(fn_out); + free(fn_fai); free(fn_out); if (fn_out_idx) free(fn_out_idx); sam_global_args_free(&ga); diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c index ecc3691..e90255f 100644 --- a/samtools/padding.c.pysam.c +++ b/samtools/padding.c.pysam.c @@ -3,7 +3,7 @@ /* padding.c -- depad subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2014-2016, 2019 Genome Research Ltd. + Copyright (C) 2014-2016, 2019-2020 Genome Research Ltd. Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. Author: Heng Li @@ -40,24 +40,38 @@ DEALINGS IN THE SOFTWARE. */ #define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5) -// The one and only function needed from sam.c. -// Explicitly here to avoid including bam.h translation layer. 
-extern char *samfaipath(const char *fn_ref); - -static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) +static int replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar) { + int diff = 0; if (n != b->core.n_cigar) { int o = b->core.l_qname + b->core.n_cigar * 4; - if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) { - b->m_data = b->l_data + (n - b->core.n_cigar) * 4; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + if (n > b->core.n_cigar) { + diff = (n - b->core.n_cigar) * 4; + if ((INT_MAX - b->l_data)/4 < (n - b->core.n_cigar)) { + fprintf(samtools_stderr, "[depad] ERROR: BAM record too big\n"); + return -1; + } + if (b->l_data + diff > b->m_data) { + b->m_data = b->l_data + diff; + kroundup32(b->m_data); + uint8_t *tmp = (uint8_t*)realloc(b->data, b->m_data); + if (!tmp) { + fprintf(samtools_stderr, "[depad] ERROR: Memory allocation failure.\n"); + return -1; + } + b->data = tmp; + } + } else { + diff = -(int)((b->core.n_cigar - n) * 4); } memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o); - memcpy(b->data + b->core.l_qname, cigar, n * 4); - b->l_data += (n - b->core.n_cigar) * 4; b->core.n_cigar = n; - } else memcpy(b->data + b->core.l_qname, cigar, n * 4); + } + + memcpy(b->data + b->core.l_qname, cigar, n * 4); + b->l_data += diff; + + return 0; } #define write_cigar(_c, _n, _m, _v) do { \ @@ -197,7 +211,8 @@ int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) kstring_t r, q; int r_tid = -1; uint32_t *cigar2 = 0; - int ret = 0, n2 = 0, m2 = 0, *posmap = 0; + int ret = 0, *posmap = 0; + uint32_t n2 = 0, m2 = 0; b = bam_init1(); if (!b) { @@ -244,7 +259,8 @@ int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) } } write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); - replace_cigar(b, n2, cigar2); + if (replace_cigar(b, n2, cigar2) < 0) + return -1; posmap = update_posmap(posmap, r); } else if (b->core.n_cigar > 0) { int i, k, op; @@ -330,7 
+346,8 @@ int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) for (i = k = 0; i < n2; ++i) if (cigar2[i]) cigar2[k++] = cigar2[i]; n2 = k; - replace_cigar(b, n2, cigar2); + if (replace_cigar(b, n2, cigar2) < 0) + return -1; } /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */ if (b->core.pos != -1) b->core.pos = posmap[b->core.pos]; @@ -432,7 +449,7 @@ int main_pad2unpad(int argc, char *argv[]) sam_hdr_t *h = 0, *h_fix = 0; faidx_t *fai = 0; int c, compress_level = -1, is_long_help = 0, no_pg = 0; - char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL; + char in_mode[5], out_mode[6], *fn_out = 0, *fn_fai = 0, *fn_out_idx = NULL; int ret=0; char *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -479,8 +496,8 @@ int main_pad2unpad(int argc, char *argv[]) // Load FASTA reference (also needed for SAM -> BAM if missing header) if (ga.reference) { - fn_list = samfaipath(ga.reference); - fai = fai_load(ga.reference); + fn_fai = fai_path(ga.reference); + fai = fai_load3(ga.reference, fn_fai, NULL, FAI_CREATE); } // open file handlers if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) { @@ -488,8 +505,8 @@ int main_pad2unpad(int argc, char *argv[]) ret = 1; goto depad_end; } - if (fn_list && hts_set_fai_filename(in, fn_list) != 0) { - fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list); + if (fn_fai && hts_set_fai_filename(in, fn_fai) != 0) { + fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_fai); ret = 1; goto depad_end; } @@ -572,7 +589,7 @@ depad_end: fprintf(samtools_stderr, "[depad] error on closing output file.\n"); ret = 1; } - free(fn_list); free(fn_out); + free(fn_fai); free(fn_out); if (fn_out_idx) free(fn_out_idx); sam_global_args_free(&ga); diff --git a/samtools/phase.c b/samtools/phase.c index 871e7c3..50f7a8f 100644 --- a/samtools/phase.c +++ b/samtools/phase.c @@ -583,6 +583,7 @@ static int 
start_output(phaseg_t *g, int c, const char *middle, const htsFormat int main_phase(int argc, char *argv[]) { int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0; + int status = EXIT_SUCCESS; const bam_pileup1_t *plp; bam_plp_t iter; nseq_t *seqs; @@ -785,6 +786,12 @@ int main_phase(int argc, char *argv[]) return 1; } } + + if (n < 0) { + print_error("phase", "error reading from '%s'", argv[optind]); + status = EXIT_FAILURE; + } + sam_hdr_destroy(g.fp_hdr); bam_plp_destroy(iter); sam_close(g.fp); @@ -809,5 +816,5 @@ int main_phase(int argc, char *argv[]) } free(g.arg_list); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index 6357eab..13ab556 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -585,6 +585,7 @@ static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat int main_phase(int argc, char *argv[]) { int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0; + int status = EXIT_SUCCESS; const bam_pileup1_t *plp; bam_plp_t iter; nseq_t *seqs; @@ -787,6 +788,12 @@ int main_phase(int argc, char *argv[]) return 1; } } + + if (n < 0) { + print_error("phase", "error reading from '%s'", argv[optind]); + status = EXIT_FAILURE; + } + sam_hdr_destroy(g.fp_hdr); bam_plp_destroy(iter); sam_close(g.fp); @@ -811,5 +818,5 @@ int main_phase(int argc, char *argv[]) } free(g.arg_list); sam_global_args_free(&ga); - return 0; + return status; } diff --git a/samtools/sam_view.c b/samtools/sam_view.c index c13aea8..515eaa5 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -1,6 +1,6 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2019 Genome Research Ltd. + Copyright (C) 2009-2021 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -37,20 +37,20 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/faidx.h" #include "htslib/khash.h" #include "htslib/thread_pool.h" +#include "htslib/hts_expr.h" #include "samtools.h" #include "sam_opts.h" #include "bedidx.h" -KHASH_SET_INIT_STR(rg) -KHASH_SET_INIT_STR(tv) +KHASH_SET_INIT_STR(str) -typedef khash_t(rg) *rghash_t; -typedef khash_t(tv) *tvhash_t; +typedef khash_t(str) *strhash_t; // This structure contains the settings for a samview run typedef struct samview_settings { - rghash_t rghash; - tvhash_t tvhash; + strhash_t rghash; + strhash_t rnhash; + strhash_t tvhash; int min_mapQ; int flag_on; int flag_off; @@ -65,13 +65,15 @@ typedef struct samview_settings { char** remove_aux; int multi_region; char* tag; + hts_filter_t *filter; + int remove_flag; + int add_flag; } samview_settings_t; // TODO Add declarations of these to a viable htslib or samtools header extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); extern int bam_remove_B(bam1_t *b); -extern char *samfaipath(const char *fn_ref); // Returns 0 to indicate read should be output 1 otherwise static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) @@ -98,19 +100,39 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { - khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); + khint_t k = kh_get(str, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } - if (settings->tvhash && settings->tag) { + if (settings->tag) { uint8_t *s = bam_aux_get(b, settings->tag); if (s) { - khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); - if (k == kh_end(settings->tvhash)) return 1; + if (settings->tvhash) { + char t[32], *val; + if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') { + int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s)); + if (ret > 0) val = t; + else return 1; + } else if (*s == 'A') { + t[0] = *(s+1); + t[1] = 0; + val = t; + } 
else { + val = (char *)(s+1); + } + khint_t k = kh_get(str, settings->tvhash, val); + if (k == kh_end(settings->tvhash)) return 1; + } } else { return 1; } } + if (settings->rnhash) { + const char* rn = bam_get_qname(b); + if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) { + return 1; + } + } if (settings->library) { const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; @@ -124,11 +146,43 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin } } } + + if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) + return 1; + return 0; } static int usage(FILE *fp, int exit_status, int is_long_help); +static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn) +{ + FILE *fp; + char buf[1024]; + int ret = 0; + fp = fopen(fn, "r"); + if (fp == NULL) { + print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); + return -1; + } + + while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { + char *d = strdup(buf); + if (d != NULL) { + kh_put(str, lookup, d, &ret); + if (ret == 0) free(d); /* Duplicate */ + } else { + ret = -1; + } + } + if (ferror(fp)) ret = -1; + if (ret == -1) { + print_error_errno(subcmd, "failed to read \"%s\"", fn); + } + fclose(fp); + return (ret != -1) ? 
0 : -1; +} + static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) { char *d = strdup(name); @@ -137,11 +191,11 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting if (d == NULL) goto err; if (settings->rghash == NULL) { - settings->rghash = kh_init(rg); + settings->rghash = kh_init(str); if (settings->rghash == NULL) goto err; } - kh_put(rg, settings->rghash, d, &ret); + kh_put(str, settings->rghash, d, &ret); if (ret == -1) goto err; if (ret == 0) free(d); /* Duplicate */ return 0; @@ -152,40 +206,28 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting return -1; } -static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) +static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn) { - FILE *fp; - char buf[1024]; - int ret = 0; - if (settings->rghash == NULL) { - settings->rghash = kh_init(rg); - if (settings->rghash == NULL) { + if (settings->rnhash == NULL) { + settings->rnhash = kh_init(str); + if (settings->rnhash == NULL) { perror(NULL); return -1; } } + return populate_lookup_from_file(subcmd, settings->rnhash, fn); +} - fp = fopen(fn, "r"); - if (fp == NULL) { - print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); - return -1; - } - - while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { - char *d = strdup(buf); - if (d != NULL) { - kh_put(rg, settings->rghash, d, &ret); - if (ret == 0) free(d); /* Duplicate */ - } else { - ret = -1; +static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) +{ + if (settings->rghash == NULL) { + settings->rghash = kh_init(str); + if (settings->rghash == NULL) { + perror(NULL); + return -1; } } - if (ferror(fp)) ret = -1; - if (ret == -1) { - print_error_errno(subcmd, "failed to read \"%s\"", fn); - } - fclose(fp); - return (ret != -1) ? 
0 : -1; + return populate_lookup_from_file(subcmd, settings->rghash, fn); } static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) @@ -196,11 +238,11 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings if (d == NULL) goto err; if (settings->tvhash == NULL) { - settings->tvhash = kh_init(tv); + settings->tvhash = kh_init(str); if (settings->tvhash == NULL) goto err; } - kh_put(tv, settings->tvhash, d, &ret); + kh_put(str, settings->tvhash, d, &ret); if (ret == -1) goto err; if (ret == 0) free(d); /* Duplicate */ return 0; @@ -213,38 +255,14 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) { - FILE *fp; - char buf[1024]; - int ret = 0; if (settings->tvhash == NULL) { - settings->tvhash = kh_init(tv); + settings->tvhash = kh_init(str); if (settings->tvhash == NULL) { perror(NULL); return -1; } } - - fp = fopen(fn, "r"); - if (fp == NULL) { - print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); - return -1; - } - - while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { - char *d = strdup(buf); - if (d != NULL) { - kh_put(tv, settings->tvhash, d, &ret); - if (ret == 0) free(d); /* Duplicate */ - } else { - ret = -1; - } - } - if (ferror(fp)) ret = -1; - if (ret == -1) { - print_error_errno(subcmd, "failed to read \"%s\"", fn); - } - fclose(fp); - return (ret != -1) ? 
0 : -1; + return populate_lookup_from_file(subcmd, settings->tvhash, fn); } static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) @@ -259,6 +277,18 @@ static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t return r; } +static inline void change_flag(bam1_t *b, samview_settings_t *settings) +{ + if (settings->add_flag) + b->core.flag |= settings->add_flag; + + if (settings->remove_flag) + b->core.flag &= ~settings->remove_flag; +} + +// Make mnemonic distinct values for longoption-only options +#define LONGOPT(c) ((c) + 128) + int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; @@ -266,8 +296,8 @@ int main_samview(int argc, char *argv[]) samFile *in = 0, *out = 0, *un_out=0; FILE *fp_out = NULL; sam_hdr_t *header = NULL; - char out_mode[5], out_un_mode[5], *out_format = ""; - char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; + char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = ""; + char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0; char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; @@ -288,12 +318,59 @@ int main_samview(int argc, char *argv[]) .library = NULL, .bed = NULL, .multi_region = 0, - .tag = NULL + .tag = NULL, + .filter = NULL, + .remove_flag = 0, + .add_flag = 0 }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), - {"no-PG", no_argument, NULL, 1}, + {"add-flags", required_argument, NULL, LONGOPT('a')}, + {"bam", no_argument, NULL, 'b'}, + {"count", no_argument, NULL, 'c'}, + {"cram", no_argument, NULL, 'C'}, + {"customised-index", no_argument, NULL, 'X'}, + {"customized-index", no_argument, NULL, 'X'}, + {"excl-flags", required_argument, NULL, 'F'}, + {"exclude-flags", 
required_argument, NULL, 'F'}, + {"expr", required_argument, NULL, 'e'}, + {"expression", required_argument, NULL, 'e'}, + {"fai-reference", required_argument, NULL, 't'}, + {"fast", no_argument, NULL, '1'}, + {"header-only", no_argument, NULL, 'H'}, + {"help", no_argument, NULL, LONGOPT('?')}, + {"library", required_argument, NULL, 'l'}, + {"min-mapq", required_argument, NULL, 'q'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-qlen", required_argument, NULL, 'm'}, + {"no-header", no_argument, NULL, LONGOPT('H')}, + {"no-PG", no_argument, NULL, LONGOPT('P')}, + {"output", required_argument, NULL, 'o'}, + {"output-unselected", required_argument, NULL, 'U'}, + {"QNAME-file", required_argument, NULL, 'N'}, + {"qname-file", required_argument, NULL, 'N'}, + {"read-group", required_argument, NULL, 'r'}, + {"read-group-file", required_argument, NULL, 'R'}, + {"readgroup", required_argument, NULL, 'r'}, + {"readgroup-file", required_argument, NULL, 'R'}, + {"region-file", required_argument, NULL, LONGOPT('L')}, + {"regions-file", required_argument, NULL, LONGOPT('L')}, + {"remove-B", no_argument, NULL, 'B'}, + {"remove-flags", required_argument, NULL, LONGOPT('r')}, + {"remove-tag", required_argument, NULL, 'x'}, + {"require-flags", required_argument, NULL, 'f'}, + {"subsample", required_argument, NULL, LONGOPT('s')}, + {"subsample-seed", required_argument, NULL, LONGOPT('S')}, + {"tag", required_argument, NULL, 'd'}, + {"tag-file", required_argument, NULL, 'D'}, + {"target-file", required_argument, NULL, 'L'}, + {"targets-file", required_argument, NULL, 'L'}, + {"uncompressed", no_argument, NULL, 'u'}, + {"unoutput", required_argument, NULL, 'U'}, + {"use-index", no_argument, NULL, 'M'}, + {"with-header", no_argument, NULL, 'h'}, { NULL, 0, NULL, 0 } }; @@ -310,16 +387,11 @@ int main_samview(int argc, char *argv[]) opterr = 0; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", + 
"SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:", lopts, NULL)) >= 0) { switch (c) { case 's': - if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { - // Convert likely user input 0,1,2,... to pseudo-random - // values with more entropy and more bits set - srand(settings.subsam_seed); - settings.subsam_seed = rand(); - } + settings.subsam_seed = strtol(optarg, &q, 10); if (q && *q == '.') { settings.subsam_frac = strtod(q, &q); if (*q) ret = 1; @@ -332,24 +404,36 @@ int main_samview(int argc, char *argv[]) goto view_end; } break; + case LONGOPT('s'): + settings.subsam_frac = strtod(optarg, &q); + if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { + print_error("view", "Incorrect sampling argument \"%s\"", optarg); + goto view_end; + } + break; + case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; - case 't': fn_list = strdup(optarg); break; + case 't': fn_fai = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; + case LONGOPT('H'): is_header = is_header_only = 0; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; - case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; - case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; + case 'f': settings.flag_on |= bam_str2flag(optarg); break; + case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; + case LONGOPT('L'): + settings.multi_region = 1; + // fall through case 'L': if 
((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", "Could not read file \"%s\"", optarg); @@ -369,8 +453,14 @@ int main_samview(int argc, char *argv[]) goto view_end; } break; + case 'N': + if (add_read_names_file("view", &settings, optarg) != 0) { + ret = 1; + goto view_end; + } + break; case 'd': - if (strlen(optarg) < 4 || optarg[2] != ':') { + if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) { print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); ret = 1; goto view_end; @@ -391,7 +481,8 @@ int main_samview(int argc, char *argv[]) memcpy(settings.tag, optarg, 2); } - if (add_tag_value_single("view", &settings, optarg+3) != 0) { + if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) { + print_error("view", "Could not add tag:value \"%s\"", optarg); ret = 1; goto view_end; } @@ -399,7 +490,7 @@ int main_samview(int argc, char *argv[]) case 'D': // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX // path translation as described at: - // http://www.mingw.org/wiki/Posix_path_conversion + // http://www.mingw.org/wiki/Posix_path_conversion if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); ret = 1; @@ -430,6 +521,8 @@ int main_samview(int argc, char *argv[]) //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ + case LONGOPT('?'): + return usage(stdout, EXIT_SUCCESS, 1); case '?': if (optopt == '?') { // '-?' 
appeared on command line return usage(stdout, EXIT_SUCCESS, 1); @@ -451,7 +544,7 @@ int main_samview(int argc, char *argv[]) case 'x': { if (strlen(optarg) != 2) { - fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); + print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long."); return usage(stderr, EXIT_FAILURE, 0); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); @@ -459,13 +552,22 @@ int main_samview(int argc, char *argv[]) } break; case 'M': settings.multi_region = 1; break; - case 1: no_pg = 1; break; + case LONGOPT('P'): no_pg = 1; break; + case 'e': + if (!(settings.filter = hts_filter_init(optarg))) { + print_error("main_samview", "Couldn't initialise filter"); + return 1; + } + break; + case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; + case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, 0); break; } } + if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference); if (compress_level >= 0 && !*out_format) out_format = "b"; if (is_header_only) is_header = 1; // File format auto-detection first @@ -474,8 +576,7 @@ int main_samview(int argc, char *argv[]) // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; - out_mode[2] = out_un_mode[2] = '\0'; - // out_(un_)mode now 1 or 2 bytes long, followed by nul. + // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; @@ -486,20 +587,23 @@ int main_samview(int argc, char *argv[]) print_error("view", "No input provided or missing option argument."); return usage(stderr, EXIT_FAILURE, 0); // potential memory leak... } + if (settings.subsam_seed != 0) { + // Convert likely user input 1,2,... 
to pseudo-random + // values with more entropy and more bits set + srand(settings.subsam_seed); + settings.subsam_seed = rand(); + } fn_in = (optind < argc)? argv[optind] : "-"; - // generate the fn_list if necessary - if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference); - // open file handlers if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { print_error_errno("view", "failed to open \"%s\" for reading", fn_in); ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(in, fn_list) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + if (fn_fai) { + if (hts_set_fai_filename(in, fn_fai) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); ret = 1; goto view_end; } @@ -518,9 +622,9 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(out, fn_list) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + if (fn_fai) { + if (hts_set_fai_filename(out, fn_fai) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); ret = 1; goto view_end; } @@ -565,9 +669,9 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(un_out, fn_list) != 0) { - fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + if (fn_fai) { + if (hts_set_fai_filename(un_out, fn_fai) != 0) { + fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); ret = 1; goto view_end; } @@ -654,7 +758,10 @@ int main_samview(int argc, char *argv[]) // fetch alignments while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if 
(un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -682,16 +789,20 @@ int main_samview(int argc, char *argv[]) if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file bam1_t *b = bam_init1(); int r; + errno = 0; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (r < -1) { - fprintf(stderr, "[main_samview] truncated file.\n"); + print_error_errno("view", "error reading file \"%s\"", fn_in); ret = 1; } bam_destroy1(b); @@ -722,7 +833,10 @@ int main_samview(int argc, char *argv[]) // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -766,7 +880,7 @@ view_end: if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); if (fp_out) fclose(fp_out); - free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); + free(fn_fai); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); if ( header ) sam_hdr_destroy(header); if (settings.bed) bed_destroy(settings.bed); @@ -774,13 +888,19 @@ view_end: khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); - kh_destroy(rg, settings.rghash); + kh_destroy(str, settings.rghash); 
+ } + if (settings.rnhash) { + khint_t k; + for (k = 0; k < kh_end(settings.rnhash); ++k) + if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k)); + kh_destroy(str, settings.rnhash); } if (settings.tvhash) { khint_t k; for (k = 0; k < kh_end(settings.tvhash); ++k) if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); - kh_destroy(tv, settings.tvhash); + kh_destroy(str, settings.tvhash); } if (settings.remove_aux_len) { free(settings.remove_aux); @@ -788,6 +908,8 @@ view_end: if (settings.tag) { free(settings.tag); } + if (settings.filter) + hts_filter_free(settings.filter); if (p.pool) hts_tpool_destroy(p.pool); @@ -807,47 +929,52 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "Usage: samtools view [options] || [region ...]\n" "\n" -"Options:\n" -// output options -" -b output BAM\n" -" -C output CRAM (requires -T)\n" -" -1 use fast BAM compression (implies -b)\n" -" -u uncompressed BAM output (implies -b)\n" -" -h include header in SAM output\n" -" -H print SAM header only (no alignments)\n" -" -c print only the count of matching records\n" -" -o FILE output file name [stdout]\n" -" -U FILE output reads not selected by filters to FILE [null]\n" -// extra input -" -t FILE FILE listing reference names and lengths (see long help) [null]\n" -" -X include customized index file\n" -// read filters -" -L FILE only include reads overlapping this BED FILE [null]\n" -" -r STR only include reads in read group STR [null]\n" -" -R FILE only include reads with read group listed in FILE [null]\n" -" -d STR:STR\n" -" only include reads with tag STR and associated value STR [null]\n" -" -D STR:FILE\n" -" only include reads with tag STR and associated values listed in\n" -" FILE [null]\n" -" -q INT only include reads with mapping quality >= INT [0]\n" -" -l STR only include reads in library STR [null]\n" -" -m INT only include reads with number of CIGAR operations consuming\n" -" query sequence >= INT [0]\n" -" -f INT 
only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 -" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" -" fraction of templates/read pairs to keep; INT part sets seed)\n" -" -M use the multi-region iterator (increases the speed, removes\n" -" duplicates and outputs the reads as they are ordered in the file)\n" -// read processing -" -x STR read tag to strip (repeatable) [null]\n" -" -B collapse the backward CIGAR operation\n" -// general options -" -? print long help, including note about region specification\n" -" -S ignored (input format is auto-detected)\n" -" --no-PG do not add a PG line\n"); +"Output options:\n" +" -b, --bam Output BAM\n" +" -C, --cram Output CRAM (requires -T)\n" +" -1, --fast Use fast BAM compression (implies --bam)\n" +" -u, --uncompressed Uncompressed BAM output (implies --bam)\n" +" -h, --with-header Include header in SAM output\n" +" -H, --header-only Print SAM header only (no alignments)\n" +" --no-header Print SAM alignment records only [default]\n" +" -c, --count Print only the count of matching records\n" +" -o, --output FILE Write output to FILE [standard output]\n" +" -U, --unoutput FILE, --output-unselected FILE\n" +" Output reads not selected by filters to FILE\n" +"Input options:\n" +" -t, --fai-reference FILE FILE listing reference names and lengths\n" +" -M, --use-index Use index and multi-region iterator for regions\n" +" --region[s]-file FILE Use index to include only reads overlapping FILE\n" +" -X, --customized-index Expect extra index file argument after \n" +"\n" +"Filtering options (Only include in output reads that...):\n" +" -L, --target[s]-file FILE ...overlap (BED) regions in FILE\n" +" -r, --read-group STR ...are in read group STR\n" +" -R, --read-group-file FILE ...are in a read group listed in 
FILE\n" +" -N, --qname-file FILE ...whose read name is listed in FILE\n" +" -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2)\n" +" -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE\n" +" -q, --min-MQ INT ...have mapping quality >= INT\n" +" -l, --library STR ...are in library STR\n" +" -m, --min-qlen INT ...cover >= INT query bases (as measured via CIGAR)\n" +" -e, --expr STR ...match the filter expression STR\n" +" -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x +" -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0 +" -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option +" --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n" +" --subsample-seed INT Influence WHICH reads are kept in subsampling [0]\n" +" -s INT.FRAC Same as --subsample 0.FRAC --subsample-seed INT\n" +"\n" +"Processing options:\n" +" --add-flags FLAG Add FLAGs to reads\n" +" --remove-flags FLAG Remove FLAGs from reads\n" +" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n" +" -B, --remove-B Collapse the backward CIGAR operation\n" +"\n" +"General options:\n" +" -?, --help Print long help, including note about region specification\n" +" -S Ignored (input format is auto-detected)\n" +" --no-PG Do not add a PG line\n"); sam_global_opt_help(fp, "-.O.T@.."); fprintf(fp, "\n"); @@ -887,23 +1014,16 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" +"\n" +"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n" +" command-line region arguments to be computed using the multi-region iterator\n" +" and an index. This increases speed, omits duplicates, and outputs the reads\n" +" as they are ordered in the input SAM/BAM/CRAM file.\n" +"\n" +"8. 
Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n" +" together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n" +" so using both causes one of the specified BED files to be ignored.\n" "\n"); return exit_status; } - -int main_import(int argc, char *argv[]) -{ - int argc2, ret; - char **argv2; - if (argc != 4) { - fprintf(stderr, "Usage: samtools import \n"); - return 1; - } - argc2 = 6; - argv2 = calloc(6, sizeof(char*)); - argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; - ret = main_samview(argc2, argv2); - free(argv2); - return ret; -} diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 6153ee8..42c42e4 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -2,7 +2,7 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2019 Genome Research Ltd. + Copyright (C) 2009-2021 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -39,20 +39,20 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/faidx.h" #include "htslib/khash.h" #include "htslib/thread_pool.h" +#include "htslib/hts_expr.h" #include "samtools.h" #include "sam_opts.h" #include "bedidx.h" -KHASH_SET_INIT_STR(rg) -KHASH_SET_INIT_STR(tv) +KHASH_SET_INIT_STR(str) -typedef khash_t(rg) *rghash_t; -typedef khash_t(tv) *tvhash_t; +typedef khash_t(str) *strhash_t; // This structure contains the settings for a samview run typedef struct samview_settings { - rghash_t rghash; - tvhash_t tvhash; + strhash_t rghash; + strhash_t rnhash; + strhash_t tvhash; int min_mapQ; int flag_on; int flag_off; @@ -67,13 +67,15 @@ typedef struct samview_settings { char** remove_aux; int multi_region; char* tag; + hts_filter_t *filter; + int remove_flag; + int add_flag; } samview_settings_t; // TODO Add declarations of these to a viable htslib or samtools header extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); extern int bam_remove_B(bam1_t *b); -extern char *samfaipath(const char *fn_ref); // Returns 0 to indicate read should be output 1 otherwise static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) @@ -100,19 +102,39 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { - khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); + khint_t k = kh_get(str, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } - if (settings->tvhash && settings->tag) { + if (settings->tag) { uint8_t *s = bam_aux_get(b, settings->tag); if (s) { - khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); - if (k == kh_end(settings->tvhash)) return 1; + if (settings->tvhash) { + char t[32], *val; + if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') { + int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s)); + if (ret > 0) val = t; + else return 1; + } else if (*s == 'A') { + t[0] = *(s+1); + t[1] = 0; + val = t; + } 
else { + val = (char *)(s+1); + } + khint_t k = kh_get(str, settings->tvhash, val); + if (k == kh_end(settings->tvhash)) return 1; + } } else { return 1; } } + if (settings->rnhash) { + const char* rn = bam_get_qname(b); + if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) { + return 1; + } + } if (settings->library) { const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; @@ -126,11 +148,43 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin } } } + + if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) + return 1; + return 0; } static int usage(FILE *fp, int exit_status, int is_long_help); +static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn) +{ + FILE *fp; + char buf[1024]; + int ret = 0; + fp = fopen(fn, "r"); + if (fp == NULL) { + print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); + return -1; + } + + while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { + char *d = strdup(buf); + if (d != NULL) { + kh_put(str, lookup, d, &ret); + if (ret == 0) free(d); /* Duplicate */ + } else { + ret = -1; + } + } + if (ferror(fp)) ret = -1; + if (ret == -1) { + print_error_errno(subcmd, "failed to read \"%s\"", fn); + } + fclose(fp); + return (ret != -1) ? 
0 : -1; +} + static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) { char *d = strdup(name); @@ -139,11 +193,11 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting if (d == NULL) goto err; if (settings->rghash == NULL) { - settings->rghash = kh_init(rg); + settings->rghash = kh_init(str); if (settings->rghash == NULL) goto err; } - kh_put(rg, settings->rghash, d, &ret); + kh_put(str, settings->rghash, d, &ret); if (ret == -1) goto err; if (ret == 0) free(d); /* Duplicate */ return 0; @@ -154,40 +208,28 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting return -1; } -static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) +static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn) { - FILE *fp; - char buf[1024]; - int ret = 0; - if (settings->rghash == NULL) { - settings->rghash = kh_init(rg); - if (settings->rghash == NULL) { + if (settings->rnhash == NULL) { + settings->rnhash = kh_init(str); + if (settings->rnhash == NULL) { perror(NULL); return -1; } } + return populate_lookup_from_file(subcmd, settings->rnhash, fn); +} - fp = fopen(fn, "r"); - if (fp == NULL) { - print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); - return -1; - } - - while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { - char *d = strdup(buf); - if (d != NULL) { - kh_put(rg, settings->rghash, d, &ret); - if (ret == 0) free(d); /* Duplicate */ - } else { - ret = -1; +static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) +{ + if (settings->rghash == NULL) { + settings->rghash = kh_init(str); + if (settings->rghash == NULL) { + perror(NULL); + return -1; } } - if (ferror(fp)) ret = -1; - if (ret == -1) { - print_error_errno(subcmd, "failed to read \"%s\"", fn); - } - fclose(fp); - return (ret != -1) ? 
0 : -1; + return populate_lookup_from_file(subcmd, settings->rghash, fn); } static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) @@ -198,11 +240,11 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings if (d == NULL) goto err; if (settings->tvhash == NULL) { - settings->tvhash = kh_init(tv); + settings->tvhash = kh_init(str); if (settings->tvhash == NULL) goto err; } - kh_put(tv, settings->tvhash, d, &ret); + kh_put(str, settings->tvhash, d, &ret); if (ret == -1) goto err; if (ret == 0) free(d); /* Duplicate */ return 0; @@ -215,38 +257,14 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) { - FILE *fp; - char buf[1024]; - int ret = 0; if (settings->tvhash == NULL) { - settings->tvhash = kh_init(tv); + settings->tvhash = kh_init(str); if (settings->tvhash == NULL) { perror(NULL); return -1; } } - - fp = fopen(fn, "r"); - if (fp == NULL) { - print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); - return -1; - } - - while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { - char *d = strdup(buf); - if (d != NULL) { - kh_put(tv, settings->tvhash, d, &ret); - if (ret == 0) free(d); /* Duplicate */ - } else { - ret = -1; - } - } - if (ferror(fp)) ret = -1; - if (ret == -1) { - print_error_errno(subcmd, "failed to read \"%s\"", fn); - } - fclose(fp); - return (ret != -1) ? 
0 : -1; + return populate_lookup_from_file(subcmd, settings->tvhash, fn); } static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) @@ -261,6 +279,18 @@ static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t return r; } +static inline void change_flag(bam1_t *b, samview_settings_t *settings) +{ + if (settings->add_flag) + b->core.flag |= settings->add_flag; + + if (settings->remove_flag) + b->core.flag &= ~settings->remove_flag; +} + +// Make mnemonic distinct values for longoption-only options +#define LONGOPT(c) ((c) + 128) + int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; @@ -268,8 +298,8 @@ int main_samview(int argc, char *argv[]) samFile *in = 0, *out = 0, *un_out=0; FILE *fp_out = NULL; sam_hdr_t *header = NULL; - char out_mode[5], out_un_mode[5], *out_format = ""; - char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; + char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = ""; + char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0; char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; @@ -290,12 +320,59 @@ int main_samview(int argc, char *argv[]) .library = NULL, .bed = NULL, .multi_region = 0, - .tag = NULL + .tag = NULL, + .filter = NULL, + .remove_flag = 0, + .add_flag = 0 }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), - {"no-PG", no_argument, NULL, 1}, + {"add-flags", required_argument, NULL, LONGOPT('a')}, + {"bam", no_argument, NULL, 'b'}, + {"count", no_argument, NULL, 'c'}, + {"cram", no_argument, NULL, 'C'}, + {"customised-index", no_argument, NULL, 'X'}, + {"customized-index", no_argument, NULL, 'X'}, + {"excl-flags", required_argument, NULL, 'F'}, + {"exclude-flags", 
required_argument, NULL, 'F'}, + {"expr", required_argument, NULL, 'e'}, + {"expression", required_argument, NULL, 'e'}, + {"fai-reference", required_argument, NULL, 't'}, + {"fast", no_argument, NULL, '1'}, + {"header-only", no_argument, NULL, 'H'}, + {"help", no_argument, NULL, LONGOPT('?')}, + {"library", required_argument, NULL, 'l'}, + {"min-mapq", required_argument, NULL, 'q'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-qlen", required_argument, NULL, 'm'}, + {"no-header", no_argument, NULL, LONGOPT('H')}, + {"no-PG", no_argument, NULL, LONGOPT('P')}, + {"output", required_argument, NULL, 'o'}, + {"output-unselected", required_argument, NULL, 'U'}, + {"QNAME-file", required_argument, NULL, 'N'}, + {"qname-file", required_argument, NULL, 'N'}, + {"read-group", required_argument, NULL, 'r'}, + {"read-group-file", required_argument, NULL, 'R'}, + {"readgroup", required_argument, NULL, 'r'}, + {"readgroup-file", required_argument, NULL, 'R'}, + {"region-file", required_argument, NULL, LONGOPT('L')}, + {"regions-file", required_argument, NULL, LONGOPT('L')}, + {"remove-B", no_argument, NULL, 'B'}, + {"remove-flags", required_argument, NULL, LONGOPT('r')}, + {"remove-tag", required_argument, NULL, 'x'}, + {"require-flags", required_argument, NULL, 'f'}, + {"subsample", required_argument, NULL, LONGOPT('s')}, + {"subsample-seed", required_argument, NULL, LONGOPT('S')}, + {"tag", required_argument, NULL, 'd'}, + {"tag-file", required_argument, NULL, 'D'}, + {"target-file", required_argument, NULL, 'L'}, + {"targets-file", required_argument, NULL, 'L'}, + {"uncompressed", no_argument, NULL, 'u'}, + {"unoutput", required_argument, NULL, 'U'}, + {"use-index", no_argument, NULL, 'M'}, + {"with-header", no_argument, NULL, 'h'}, { NULL, 0, NULL, 0 } }; @@ -312,16 +389,11 @@ int main_samview(int argc, char *argv[]) opterr = 0; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", + 
"SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:", lopts, NULL)) >= 0) { switch (c) { case 's': - if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { - // Convert likely user input 0,1,2,... to pseudo-random - // values with more entropy and more bits set - srand(settings.subsam_seed); - settings.subsam_seed = rand(); - } + settings.subsam_seed = strtol(optarg, &q, 10); if (q && *q == '.') { settings.subsam_frac = strtod(q, &q); if (*q) ret = 1; @@ -334,24 +406,36 @@ int main_samview(int argc, char *argv[]) goto view_end; } break; + case LONGOPT('s'): + settings.subsam_frac = strtod(optarg, &q); + if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { + print_error("view", "Incorrect sampling argument \"%s\"", optarg); + goto view_end; + } + break; + case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; - case 't': fn_list = strdup(optarg); break; + case 't': fn_fai = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; + case LONGOPT('H'): is_header = is_header_only = 0; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; - case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; - case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; + case 'f': settings.flag_on |= bam_str2flag(optarg); break; + case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; + case LONGOPT('L'): + settings.multi_region = 1; + // fall through case 'L': if 
((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", "Could not read file \"%s\"", optarg); @@ -371,8 +455,14 @@ int main_samview(int argc, char *argv[]) goto view_end; } break; + case 'N': + if (add_read_names_file("view", &settings, optarg) != 0) { + ret = 1; + goto view_end; + } + break; case 'd': - if (strlen(optarg) < 4 || optarg[2] != ':') { + if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) { print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); ret = 1; goto view_end; @@ -393,7 +483,8 @@ int main_samview(int argc, char *argv[]) memcpy(settings.tag, optarg, 2); } - if (add_tag_value_single("view", &settings, optarg+3) != 0) { + if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) { + print_error("view", "Could not add tag:value \"%s\"", optarg); ret = 1; goto view_end; } @@ -401,7 +492,7 @@ int main_samview(int argc, char *argv[]) case 'D': // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX // path translation as described at: - // http://www.mingw.org/wiki/Posix_path_conversion + // http://www.mingw.org/wiki/Posix_path_conversion if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); ret = 1; @@ -432,6 +523,8 @@ int main_samview(int argc, char *argv[]) //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ + case LONGOPT('?'): + return usage(samtools_stdout, EXIT_SUCCESS, 1); case '?': if (optopt == '?') { // '-?' 
appeared on command line return usage(samtools_stdout, EXIT_SUCCESS, 1); @@ -453,7 +546,7 @@ int main_samview(int argc, char *argv[]) case 'x': { if (strlen(optarg) != 2) { - fprintf(samtools_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); + print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long."); return usage(samtools_stderr, EXIT_FAILURE, 0); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); @@ -461,13 +554,22 @@ int main_samview(int argc, char *argv[]) } break; case 'M': settings.multi_region = 1; break; - case 1: no_pg = 1; break; + case LONGOPT('P'): no_pg = 1; break; + case 'e': + if (!(settings.filter = hts_filter_init(optarg))) { + print_error("main_samview", "Couldn't initialise filter"); + return 1; + } + break; + case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; + case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(samtools_stderr, EXIT_FAILURE, 0); break; } } + if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference); if (compress_level >= 0 && !*out_format) out_format = "b"; if (is_header_only) is_header = 1; // File format auto-detection first @@ -476,8 +578,7 @@ int main_samview(int argc, char *argv[]) // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; - out_mode[2] = out_un_mode[2] = '\0'; - // out_(un_)mode now 1 or 2 bytes long, followed by nul. + // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; @@ -488,20 +589,23 @@ int main_samview(int argc, char *argv[]) print_error("view", "No input provided or missing option argument."); return usage(samtools_stderr, EXIT_FAILURE, 0); // potential memory leak... 
} + if (settings.subsam_seed != 0) { + // Convert likely user input 1,2,... to pseudo-random + // values with more entropy and more bits set + srand(settings.subsam_seed); + settings.subsam_seed = rand(); + } fn_in = (optind < argc)? argv[optind] : "-"; - // generate the fn_list if necessary - if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference); - // open file handlers if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { print_error_errno("view", "failed to open \"%s\" for reading", fn_in); ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(in, fn_list) != 0) { - fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + if (fn_fai) { + if (hts_set_fai_filename(in, fn_fai) != 0) { + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); ret = 1; goto view_end; } @@ -520,9 +624,9 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(out, fn_list) != 0) { - fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + if (fn_fai) { + if (hts_set_fai_filename(out, fn_fai) != 0) { + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); ret = 1; goto view_end; } @@ -567,9 +671,9 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } - if (fn_list) { - if (hts_set_fai_filename(un_out, fn_list) != 0) { - fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + if (fn_fai) { + if (hts_set_fai_filename(un_out, fn_fai) != 0) { + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai); ret = 1; goto view_end; } @@ -656,7 +760,10 @@ int main_samview(int argc, char *argv[]) // fetch alignments while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if 
(!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -684,16 +791,20 @@ int main_samview(int argc, char *argv[]) if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file bam1_t *b = bam_init1(); int r; + errno = 0; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (r < -1) { - fprintf(samtools_stderr, "[main_samview] truncated file.\n"); + print_error_errno("view", "error reading file \"%s\"", fn_in); ret = 1; } bam_destroy1(b); @@ -724,7 +835,10 @@ int main_samview(int argc, char *argv[]) // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -768,7 +882,7 @@ view_end: if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); if (fp_out) fclose(fp_out); - free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); + free(fn_fai); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); if ( header ) sam_hdr_destroy(header); if (settings.bed) bed_destroy(settings.bed); @@ -776,13 +890,19 @@ view_end: khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if 
(kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); - kh_destroy(rg, settings.rghash); + kh_destroy(str, settings.rghash); + } + if (settings.rnhash) { + khint_t k; + for (k = 0; k < kh_end(settings.rnhash); ++k) + if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k)); + kh_destroy(str, settings.rnhash); } if (settings.tvhash) { khint_t k; for (k = 0; k < kh_end(settings.tvhash); ++k) if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); - kh_destroy(tv, settings.tvhash); + kh_destroy(str, settings.tvhash); } if (settings.remove_aux_len) { free(settings.remove_aux); @@ -790,6 +910,8 @@ view_end: if (settings.tag) { free(settings.tag); } + if (settings.filter) + hts_filter_free(settings.filter); if (p.pool) hts_tpool_destroy(p.pool); @@ -809,47 +931,52 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "Usage: samtools view [options] || [region ...]\n" "\n" -"Options:\n" -// output options -" -b output BAM\n" -" -C output CRAM (requires -T)\n" -" -1 use fast BAM compression (implies -b)\n" -" -u uncompressed BAM output (implies -b)\n" -" -h include header in SAM output\n" -" -H print SAM header only (no alignments)\n" -" -c print only the count of matching records\n" -" -o FILE output file name [samtools_stdout]\n" -" -U FILE output reads not selected by filters to FILE [null]\n" -// extra input -" -t FILE FILE listing reference names and lengths (see long help) [null]\n" -" -X include customized index file\n" -// read filters -" -L FILE only include reads overlapping this BED FILE [null]\n" -" -r STR only include reads in read group STR [null]\n" -" -R FILE only include reads with read group listed in FILE [null]\n" -" -d STR:STR\n" -" only include reads with tag STR and associated value STR [null]\n" -" -D STR:FILE\n" -" only include reads with tag STR and associated values listed in\n" -" FILE [null]\n" -" -q INT only include reads with mapping quality >= INT [0]\n" -" -l STR only 
include reads in library STR [null]\n" -" -m INT only include reads with number of CIGAR operations consuming\n" -" query sequence >= INT [0]\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 -" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" -" fraction of templates/read pairs to keep; INT part sets seed)\n" -" -M use the multi-region iterator (increases the speed, removes\n" -" duplicates and outputs the reads as they are ordered in the file)\n" -// read processing -" -x STR read tag to strip (repeatable) [null]\n" -" -B collapse the backward CIGAR operation\n" -// general options -" -? print long help, including note about region specification\n" -" -S ignored (input format is auto-detected)\n" -" --no-PG do not add a PG line\n"); +"Output options:\n" +" -b, --bam Output BAM\n" +" -C, --cram Output CRAM (requires -T)\n" +" -1, --fast Use fast BAM compression (implies --bam)\n" +" -u, --uncompressed Uncompressed BAM output (implies --bam)\n" +" -h, --with-header Include header in SAM output\n" +" -H, --header-only Print SAM header only (no alignments)\n" +" --no-header Print SAM alignment records only [default]\n" +" -c, --count Print only the count of matching records\n" +" -o, --output FILE Write output to FILE [standard output]\n" +" -U, --unoutput FILE, --output-unselected FILE\n" +" Output reads not selected by filters to FILE\n" +"Input options:\n" +" -t, --fai-reference FILE FILE listing reference names and lengths\n" +" -M, --use-index Use index and multi-region iterator for regions\n" +" --region[s]-file FILE Use index to include only reads overlapping FILE\n" +" -X, --customized-index Expect extra index file argument after \n" +"\n" +"Filtering options (Only include in output reads that...):\n" +" -L, --target[s]-file 
FILE ...overlap (BED) regions in FILE\n" +" -r, --read-group STR ...are in read group STR\n" +" -R, --read-group-file FILE ...are in a read group listed in FILE\n" +" -N, --qname-file FILE ...whose read name is listed in FILE\n" +" -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2)\n" +" -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE\n" +" -q, --min-MQ INT ...have mapping quality >= INT\n" +" -l, --library STR ...are in library STR\n" +" -m, --min-qlen INT ...cover >= INT query bases (as measured via CIGAR)\n" +" -e, --expr STR ...match the filter expression STR\n" +" -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x +" -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0 +" -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option +" --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n" +" --subsample-seed INT Influence WHICH reads are kept in subsampling [0]\n" +" -s INT.FRAC Same as --subsample 0.FRAC --subsample-seed INT\n" +"\n" +"Processing options:\n" +" --add-flags FLAG Add FLAGs to reads\n" +" --remove-flags FLAG Remove FLAGs from reads\n" +" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n" +" -B, --remove-B Collapse the backward CIGAR operation\n" +"\n" +"General options:\n" +" -?, --help Print long help, including note about region specification\n" +" -S Ignored (input format is auto-detected)\n" +" --no-PG Do not add a PG line\n"); sam_global_opt_help(fp, "-.O.T@.."); fprintf(fp, "\n"); @@ -889,23 +1016,16 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" +"\n" +"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n" +" command-line region arguments to be computed using the multi-region iterator\n" +" and an index. 
This increases speed, omits duplicates, and outputs the reads\n" +" as they are ordered in the input SAM/BAM/CRAM file.\n" +"\n" +"8. Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n" +" together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n" +" so using both causes one of the specified BED files to be ignored.\n" "\n"); return exit_status; } - -int main_import(int argc, char *argv[]) -{ - int argc2, ret; - char **argv2; - if (argc != 4) { - fprintf(samtools_stderr, "Usage: samtools import \n"); - return 1; - } - argc2 = 6; - argv2 = calloc(6, sizeof(char*)); - argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; - ret = main_samview(argc2, argv2); - free(argv2); - return ret; -} diff --git a/samtools/samtools.pysam.c b/samtools/samtools.pysam.c index b26f892..7044603 100644 --- a/samtools/samtools.pysam.c +++ b/samtools/samtools.pysam.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,25 @@ int samtools_puts(const char *s) return putc('\n', samtools_stdout); } + +static jmp_buf samtools_jmpbuf; +static int samtools_status = 0; + +int samtools_dispatch(int argc, char *argv[]) +{ + if (setjmp(samtools_jmpbuf) == 0) + return samtools_main(argc, argv); + else + return samtools_status; +} + +void samtools_exit(int status) +{ + samtools_status = status; + longjmp(samtools_jmpbuf, 1); +} + + void samtools_set_optind(int val) { // setting this in cython via diff --git a/samtools/samtools.pysam.h b/samtools/samtools.pysam.h index df8fd01..9d20ecb 100644 --- a/samtools/samtools.pysam.h +++ b/samtools/samtools.pysam.h @@ -3,6 +3,17 @@ #include +#ifndef __has_attribute +#define __has_attribute(attribute) 0 +#endif +#ifndef PYSAM_NORETURN +#if __has_attribute(__noreturn__) || __GNUC__ >= 3 +#define PYSAM_NORETURN __attribute__((__noreturn__)) +#else +#define PYSAM_NORETURN +#endif +#endif + extern FILE * samtools_stderr; extern 
FILE * samtools_stdout; @@ -40,6 +51,8 @@ int samtools_puts(const char *s); int samtools_dispatch(int argc, char *argv[]); +void PYSAM_NORETURN samtools_exit(int status); + void samtools_set_optind(int); extern int samtools_main(int argc, char *argv[]); diff --git a/samtools/stats.c b/samtools/stats.c index 55ede4c..f030cf5 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -1,6 +1,6 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek Author: Sam Nicholls @@ -175,8 +175,8 @@ typedef struct // Arrays for the histogram data uint64_t *quals_1st, *quals_2nd; uint64_t *gc_1st, *gc_2nd; - acgtno_count_t *acgtno_cycles_1st; - acgtno_count_t *acgtno_cycles_2nd; + acgtno_count_t *acgtno_cycles_1st, *acgtno_cycles_2nd; + acgtno_count_t *acgtno_revcomp; uint64_t *read_lengths, *read_lengths_1st, *read_lengths_2nd; uint64_t *insertions, *deletions; uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; @@ -210,7 +210,7 @@ typedef struct uint64_t nbases_mapped_cigar; uint64_t nbases_trimmed; // bwa trimmed bases uint64_t nmismatches; - uint64_t nreads_QCfailed, nreads_secondary; + uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary; struct { uint32_t names, reads, quals; } checksum; @@ -250,7 +250,7 @@ typedef struct uint32_t nchunks; uint32_t pair_count; // Number of active pairs in the pairing hash table - uint32_t target_count; // Number of bases covered by the target file + uint64_t target_count; // Number of bases covered by the target file uint32_t last_pair_tid; uint32_t last_read_flush; @@ -647,6 +647,11 @@ void realloc_buffers(stats_t *stats, int seq_len) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t)); memset(stats->acgtno_cycles_2nd + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t)); + stats->acgtno_revcomp = 
realloc(stats->acgtno_revcomp, n*sizeof(acgtno_count_t)); + if ( !stats->acgtno_revcomp ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t)); + memset(stats->acgtno_revcomp + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t)); + stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); if ( !stats->read_lengths ) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); @@ -870,16 +875,20 @@ void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out switch (bam_seqi(seq, i)) { case 1: acgtno_cycles[ read_cycle ].a++; + reverse ? stats->acgtno_revcomp[ read_cycle ].t++ : stats->acgtno_revcomp[ read_cycle ].a++; break; case 2: acgtno_cycles[ read_cycle ].c++; + reverse ? stats->acgtno_revcomp[ read_cycle ].g++ : stats->acgtno_revcomp[ read_cycle ].c++; gc_count++; break; case 4: acgtno_cycles[ read_cycle ].g++; + reverse ? stats->acgtno_revcomp[ read_cycle ].c++ : stats->acgtno_revcomp[ read_cycle ].g++; gc_count++; break; case 8: + reverse ? 
stats->acgtno_revcomp[ read_cycle ].a++ : stats->acgtno_revcomp[ read_cycle ].t++; acgtno_cycles[ read_cycle ].t++; break; case 15: @@ -1129,6 +1138,8 @@ static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stat void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) { + if ( !is_in_regions(bam_line,stats) ) + return; if ( stats->rg_hash ) { const uint8_t *rg = bam_aux_get(bam_line, "RG"); @@ -1145,8 +1156,6 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair stats->nreads_filtered++; return; } - if ( !is_in_regions(bam_line,stats) ) - return; if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen ) return; @@ -1159,6 +1168,11 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair return; } + if ( bam_line->core.flag & BAM_FSUPPLEMENTARY ) + { + stats->nreads_supplementary++; + } + // If line has no sequence cannot continue int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; @@ -1187,8 +1201,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair // These stats should only be calculated for the original reads ignoring supplementary artificial reads // otherwise we'll accidentally double count - if ( IS_ORIGINAL(bam_line) ) - { + if ( IS_ORIGINAL(bam_line) ) { stats->read_lengths[read_len]++; if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; @@ -1200,7 +1213,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair count_indels(stats, bam_line); - if ( IS_PAIRED_AND_MAPPED(bam_line) ) + if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) ) { // The insert size is tricky, because for long inserts the libraries are // prepared differently and the pairs point in other direction. 
BWA does @@ -1495,7 +1508,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); - fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) + fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 
1 : 0); @@ -1510,6 +1523,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0); fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); + fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary); fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len); fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st); fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd); @@ -1535,7 +1549,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0); if ( stats->target_count ) { - fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count); + fprintf(to, "SN\tbases inside the target:\t%" PRIu64 "\n", stats->target_count); for (icov=stats->info->cov_threshold+1; icovncov; icov++) cov_sum += stats->cov[icov]; fprintf(to, "SN\tpercentage of target genome with coverage > %d (%%):\t%.2f\n", stats->info->cov_threshold, (float)(100*cov_sum)/stats->target_count); @@ -1612,7 +1626,18 @@ void output_stats(FILE *to, stats_t *stats, int sparse) 100.*(acgtno_count_1st->t + acgtno_count_2nd->t)/acgt_sum, 100.*(acgtno_count_1st->n + acgtno_count_2nd->n)/acgt_sum, 100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum); - + } + fprintf(to, "# ACGT content per cycle, read oriented. Use `grep ^GCT | cut -f 2-` to extract this part. 
The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]\n"); + for (ibase=0; ibasemax_len; ibase++) + { + acgtno_count_t *acgtno_count = &(stats->acgtno_revcomp[ibase]); + uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t; + if ( ! acgt_sum ) continue; + fprintf(to, "GCT\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, + 100.*(acgtno_count->a)/acgt_sum, + 100.*(acgtno_count->c)/acgt_sum, + 100.*(acgtno_count->g)/acgt_sum, + 100.*(acgtno_count->t)/acgt_sum); } uint64_t tA=0, tC=0, tG=0, tT=0, tN=0; @@ -1800,7 +1825,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) } } -static void init_regions(stats_t *stats, const char *file) +static void init_regions(stats_t *stats, const char *file, stats_info_t* info) { FILE *fp = fopen(file,"r"); if ( !fp ) error("%s: %s\n",file,strerror(errno)); @@ -1877,8 +1902,15 @@ static void init_regions(stats_t *stats, const char *file) } reg->npos = ++new_p; } - for (p = 0; p < reg->npos; p++) - stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1); + for (p = 0; p < reg->npos; p++) { + if (reg->pos[p].end < HTS_POS_MAX) { + stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1); + } else { + uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, r); + if (hdr_end) + stats->target_count += (hdr_end - reg->pos[p].beg + 1); + } + } } if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)))) @@ -1941,7 +1973,7 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats) return 1; } -int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) { +int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) { if ( !stats || !iter) return 1; @@ -1975,8 +2007,13 @@ int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) { for (j = 0; j < stats->regions[tid].npos; j++) { stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1; stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end; - - 
stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); + if (stats->regions[tid].pos[j].end < HTS_POS_MAX) { + stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); + } else { + uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, tid); + if (hdr_end) + stats->target_count += (hdr_end - stats->regions[tid].pos[j].beg + 1); + } } } @@ -2073,6 +2110,7 @@ void cleanup_stats(stats_t* stats) free(stats->mpc_buf); free(stats->acgtno_cycles_1st); free(stats->acgtno_cycles_2nd); + free(stats->acgtno_revcomp); free(stats->read_lengths); free(stats->read_lengths_1st); free(stats->read_lengths_2nd); @@ -2257,6 +2295,8 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr if (!stats->acgtno_cycles_1st) goto nomem; stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t)); if (!stats->acgtno_cycles_2nd) goto nomem; + stats->acgtno_revcomp = calloc(stats->nbases,sizeof(acgtno_count_t)); + if (!stats->acgtno_revcomp) goto nomem; stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); if (!stats->read_lengths) goto nomem; stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t)); @@ -2279,7 +2319,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr goto nomem; realloc_rseq_buffer(stats); if ( targets ) - init_regions(stats, targets); + init_regions(stats, targets, info); return; nomem: error("Out of memory"); @@ -2459,7 +2499,7 @@ int main_stats(int argc, char *argv[]) if (iter) { if (!targets) { all_stats->nchunks = argc-optind; - if (replicate_regions(all_stats, iter)) + if (replicate_regions(all_stats, iter, info)) fprintf(stderr, "Replications of the regions failed\n"); } diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index 3d126a7..9e8165d 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -2,7 +2,7 @@ /* stats.c -- This is the former bamcheck integrated into 
samtools/htslib. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek Author: Sam Nicholls @@ -177,8 +177,8 @@ typedef struct // Arrays for the histogram data uint64_t *quals_1st, *quals_2nd; uint64_t *gc_1st, *gc_2nd; - acgtno_count_t *acgtno_cycles_1st; - acgtno_count_t *acgtno_cycles_2nd; + acgtno_count_t *acgtno_cycles_1st, *acgtno_cycles_2nd; + acgtno_count_t *acgtno_revcomp; uint64_t *read_lengths, *read_lengths_1st, *read_lengths_2nd; uint64_t *insertions, *deletions; uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; @@ -212,7 +212,7 @@ typedef struct uint64_t nbases_mapped_cigar; uint64_t nbases_trimmed; // bwa trimmed bases uint64_t nmismatches; - uint64_t nreads_QCfailed, nreads_secondary; + uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary; struct { uint32_t names, reads, quals; } checksum; @@ -252,7 +252,7 @@ typedef struct uint32_t nchunks; uint32_t pair_count; // Number of active pairs in the pairing hash table - uint32_t target_count; // Number of bases covered by the target file + uint64_t target_count; // Number of bases covered by the target file uint32_t last_pair_tid; uint32_t last_read_flush; @@ -649,6 +649,11 @@ void realloc_buffers(stats_t *stats, int seq_len) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t)); memset(stats->acgtno_cycles_2nd + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t)); + stats->acgtno_revcomp = realloc(stats->acgtno_revcomp, n*sizeof(acgtno_count_t)); + if ( !stats->acgtno_revcomp ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t)); + memset(stats->acgtno_revcomp + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t)); + stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); if ( !stats->read_lengths ) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", 
seq_len,n*sizeof(uint64_t)); @@ -872,16 +877,20 @@ void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out switch (bam_seqi(seq, i)) { case 1: acgtno_cycles[ read_cycle ].a++; + reverse ? stats->acgtno_revcomp[ read_cycle ].t++ : stats->acgtno_revcomp[ read_cycle ].a++; break; case 2: acgtno_cycles[ read_cycle ].c++; + reverse ? stats->acgtno_revcomp[ read_cycle ].g++ : stats->acgtno_revcomp[ read_cycle ].c++; gc_count++; break; case 4: acgtno_cycles[ read_cycle ].g++; + reverse ? stats->acgtno_revcomp[ read_cycle ].c++ : stats->acgtno_revcomp[ read_cycle ].g++; gc_count++; break; case 8: + reverse ? stats->acgtno_revcomp[ read_cycle ].a++ : stats->acgtno_revcomp[ read_cycle ].t++; acgtno_cycles[ read_cycle ].t++; break; case 15: @@ -1131,6 +1140,8 @@ static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stat void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) { + if ( !is_in_regions(bam_line,stats) ) + return; if ( stats->rg_hash ) { const uint8_t *rg = bam_aux_get(bam_line, "RG"); @@ -1147,8 +1158,6 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair stats->nreads_filtered++; return; } - if ( !is_in_regions(bam_line,stats) ) - return; if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen ) return; @@ -1161,6 +1170,11 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair return; } + if ( bam_line->core.flag & BAM_FSUPPLEMENTARY ) + { + stats->nreads_supplementary++; + } + // If line has no sequence cannot continue int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; @@ -1189,8 +1203,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair // These stats should only be calculated for the original reads ignoring supplementary artificial reads // otherwise we'll accidentally double count - if ( IS_ORIGINAL(bam_line) ) - { + if ( IS_ORIGINAL(bam_line) ) { 
stats->read_lengths[read_len]++; if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; @@ -1202,7 +1215,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair count_indels(stats, bam_line); - if ( IS_PAIRED_AND_MAPPED(bam_line) ) + if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) ) { // The insert size is tricky, because for long inserts the libraries are // prepared differently and the pairs point in other direction. BWA does @@ -1497,7 +1510,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); - fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) + fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 
1 : 0); @@ -1512,6 +1525,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0); fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); + fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary); fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len); fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st); fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd); @@ -1537,7 +1551,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0); if ( stats->target_count ) { - fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count); + fprintf(to, "SN\tbases inside the target:\t%" PRIu64 "\n", stats->target_count); for (icov=stats->info->cov_threshold+1; icovncov; icov++) cov_sum += stats->cov[icov]; fprintf(to, "SN\tpercentage of target genome with coverage > %d (%%):\t%.2f\n", stats->info->cov_threshold, (float)(100*cov_sum)/stats->target_count); @@ -1614,7 +1628,18 @@ void output_stats(FILE *to, stats_t *stats, int sparse) 100.*(acgtno_count_1st->t + acgtno_count_2nd->t)/acgt_sum, 100.*(acgtno_count_1st->n + acgtno_count_2nd->n)/acgt_sum, 100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum); - + } + fprintf(to, "# ACGT content per cycle, read oriented. Use `grep ^GCT | cut -f 2-` to extract this part. 
The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]\n"); + for (ibase=0; ibasemax_len; ibase++) + { + acgtno_count_t *acgtno_count = &(stats->acgtno_revcomp[ibase]); + uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t; + if ( ! acgt_sum ) continue; + fprintf(to, "GCT\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, + 100.*(acgtno_count->a)/acgt_sum, + 100.*(acgtno_count->c)/acgt_sum, + 100.*(acgtno_count->g)/acgt_sum, + 100.*(acgtno_count->t)/acgt_sum); } uint64_t tA=0, tC=0, tG=0, tT=0, tN=0; @@ -1802,7 +1827,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse) } } -static void init_regions(stats_t *stats, const char *file) +static void init_regions(stats_t *stats, const char *file, stats_info_t* info) { FILE *fp = fopen(file,"r"); if ( !fp ) error("%s: %s\n",file,strerror(errno)); @@ -1879,8 +1904,15 @@ static void init_regions(stats_t *stats, const char *file) } reg->npos = ++new_p; } - for (p = 0; p < reg->npos; p++) - stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1); + for (p = 0; p < reg->npos; p++) { + if (reg->pos[p].end < HTS_POS_MAX) { + stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1); + } else { + uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, r); + if (hdr_end) + stats->target_count += (hdr_end - reg->pos[p].beg + 1); + } + } } if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)))) @@ -1943,7 +1975,7 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats) return 1; } -int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) { +int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) { if ( !stats || !iter) return 1; @@ -1977,8 +2009,13 @@ int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) { for (j = 0; j < stats->regions[tid].npos; j++) { stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1; stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end; - - 
stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); + if (stats->regions[tid].pos[j].end < HTS_POS_MAX) { + stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); + } else { + uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, tid); + if (hdr_end) + stats->target_count += (hdr_end - stats->regions[tid].pos[j].beg + 1); + } } } @@ -2054,7 +2091,7 @@ static void HTS_NORETURN error(const char *format, ...) vfprintf(samtools_stderr, format, ap); va_end(ap); } - exit(1); + samtools_exit(1); } void cleanup_stats_info(stats_info_t* info){ @@ -2075,6 +2112,7 @@ void cleanup_stats(stats_t* stats) free(stats->mpc_buf); free(stats->acgtno_cycles_1st); free(stats->acgtno_cycles_2nd); + free(stats->acgtno_revcomp); free(stats->read_lengths); free(stats->read_lengths_1st); free(stats->read_lengths_2nd); @@ -2259,6 +2297,8 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr if (!stats->acgtno_cycles_1st) goto nomem; stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t)); if (!stats->acgtno_cycles_2nd) goto nomem; + stats->acgtno_revcomp = calloc(stats->nbases,sizeof(acgtno_count_t)); + if (!stats->acgtno_revcomp) goto nomem; stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); if (!stats->read_lengths) goto nomem; stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t)); @@ -2281,7 +2321,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr goto nomem; realloc_rseq_buffer(stats); if ( targets ) - init_regions(stats, targets); + init_regions(stats, targets, info); return; nomem: error("Out of memory"); @@ -2461,7 +2501,7 @@ int main_stats(int argc, char *argv[]) if (iter) { if (!targets) { all_stats->nchunks = argc-optind; - if (replicate_regions(all_stats, iter)) + if (replicate_regions(all_stats, iter, info)) fprintf(samtools_stderr, "Replications of the regions failed\n"); } diff --git 
a/samtools/stats_isize.c.pysam.c b/samtools/stats_isize.c.pysam.c index 96feb90..1bb2bd4 100644 --- a/samtools/stats_isize.c.pysam.c +++ b/samtools/stats_isize.c.pysam.c @@ -97,7 +97,7 @@ static void sparse_set_f(isize_data_t data, int at, isize_insert_t field, uint64 a->max = max(at, a->max); } else { fprintf(samtools_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t"); - exit(11); + samtools_exit(11); } } else { return; diff --git a/samtools/tmp_file.h b/samtools/tmp_file.h index 15d088e..4f2647c 100644 --- a/samtools/tmp_file.h +++ b/samtools/tmp_file.h @@ -31,7 +31,7 @@ DEALINGS IN THE SOFTWARE #include #include "htslib/sam.h" -#ifdef _cplusplus +#ifdef __cplusplus extern "C" { #endif diff --git a/samtools/version.sh b/samtools/version.sh index 5ccd9bb..9d28100 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.10 +VERSION=1.13 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/setup.py b/setup.py index 072ed8a..5f2bb00 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,10 @@ import subprocess import sys import sysconfig from contextlib import contextmanager -from setuptools import setup +from distutils import log +from setuptools import setup, Command +from setuptools.command.sdist import sdist + from cy_build import CyExtension as Extension, cy_build_ext as build_ext try: import cython @@ -79,6 +82,61 @@ def run_make_print_config(): return make_print_config +# This function emulates the way distutils combines settings from sysconfig, +# environment variables, and the extension being built. It returns a dictionary +# representing the usual set of variables, suitable for writing to a generated +# file or for running configure (provided the returned LIBS is ignored). 
+def build_config_dict(ext): + def env(var): + return [os.environ[var]] if var in os.environ else [] + + def sc(var): + value = sysconfig.get_config_var(var) + return [value] if value is not None else [] + + def optionise(option, valuelist): + def quote(s): return "'"+s+"'" if " " in s else s + return list(quote(option+v) for v in valuelist) + + def kvtuples(pairlist): + def appendoptvalue(t): return t[0] if t[1] is None else t[0]+"="+t[1] + return map(appendoptvalue, pairlist) + + # For CC, select the first of these that is set + cc = (env('CC') + sc('CC') + ['gcc'])[0] + + # distutils ignores sysconfig for CPPFLAGS + cppflags = " ".join(env('CPPFLAGS') + optionise('-I', ext.include_dirs) + + optionise('-D', kvtuples(ext.define_macros)) + + optionise('-U', ext.undef_macros)) + + cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + ext.extra_compile_args) + + # distutils actually includes $CPPFLAGS here too, but that's weird and + # unnecessary for us as we know the output LDFLAGS will be used correctly + ldflags = " ".join(sc('LDFLAGS') + env('LDFLAGS') + env('CFLAGS') + + optionise('-L', ext.library_dirs) + + ext.extra_link_args) + + # ext.libraries is computed (incorporating $LIBS etc) during configure + libs = " ".join(optionise('-l', ext.libraries)) + + return { 'CC': cc, 'CPPFLAGS': cppflags, 'CFLAGS': cflags, + 'LDFLAGS': ldflags, 'LIBS': libs } + + +def write_configvars_header(filename, ext, prefix): + config = build_config_dict(ext) + if prefix != 'HTS': + config['HTSDIR'] = '(unused)' + config['CURSES_LIB'] = '(unused)' + + log.info("creating %s for '%s' extension", filename, ext.name) + with open(filename, "w") as outf: + for var, value in config.items(): + outf.write('#define {}_{} "{}"\n'.format(prefix, var, value)) + + @contextmanager def set_compiler_envvars(): tmp_vars = [] @@ -140,6 +198,46 @@ def get_pysam_version(): return version.__version__ +# Override sdist command to ensure Cythonized *.c files are included. 
+class cythonize_sdist(sdist): + # Remove when setuptools (as installed on GH runners) has these options + if not any(opt[0] == 'owner=' for opt in sdist.user_options): + sdist.user_options.append(('owner=', 'u', 'Specify owner inside tar')) + if not any(opt[0] == 'group=' for opt in sdist.user_options): + sdist.user_options.append(('group=', 'g', 'Specify group inside tar')) + + def run(self): + from Cython.Build import cythonize + cythonize(self.distribution.ext_modules) + super().run() + + +class clean_ext(Command): + description = "clean up Cython temporary files" + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + objs = glob.glob(os.path.join("pysam", "libc*.c")) + if objs: + log.info("removing 'pysam/libc*.c' (%s Cython objects)", len(objs)) + for obj in objs: + os.remove(obj) + + headers = (glob.glob(os.path.join("htslib", "*config*.h")) + + glob.glob(os.path.join("samtools", "*config*.h")) + + glob.glob(os.path.join("bcftools", "*config*.h"))) + if headers: + log.info("removing '*/*config*.h' (%s generated headers)", len(headers)) + for header in headers: + os.remove(header) + + # How to link against HTSLIB # shared: build shared chtslib from builtin htslib code. # external: use shared libhts.so compiled outside of @@ -170,8 +268,6 @@ package_dirs = {'pysam': 'pysam', config_headers = ["samtools/config.h", "bcftools/config.h"] -cmdclass = {'build_ext': build_ext} - # If cython is available, the pysam will be built using cython from # the .pyx files. If no cython is available, the C-files included in the # distribution will be used. 
@@ -191,22 +287,6 @@ if not os.path.exists(fn): "from the repository" .format(fn)) -# exclude sources that contain a main function -EXCLUDE = { - "samtools": ( - ), - "bcftools": ( - "test", "plugins", "peakfit.c", - "peakfit.h", - # needs to renamed, name conflict with samtools reheader - "reheader.c", - "polysomy.c"), - "htslib": ( - 'htslib/tabix.c', - 'htslib/bgzip.c', - 'htslib/htsfile.c'), -} - print ("# pysam: htslib mode is {}".format(HTSLIB_MODE)) print ("# pysam: HTSLIB_CONFIGURE_OPTIONS={}".format( HTSLIB_CONFIGURE_OPTIONS)) @@ -364,11 +444,20 @@ libraries_for_pysam_module = external_htslib_libraries + internal_htslib_librari # The list below uses the union of include_dirs and library_dirs for # reasons of simplicity. +def prebuild_libchtslib(ext, force): + if HTSLIB_MODE not in ['shared', 'separate']: return + write_configvars_header("htslib/config_vars.h", ext, "HTS") + +def prebuild_libcsamtools(ext, force): + write_configvars_header("samtools/samtools_config_vars.h", ext, "SAMTOOLS") + modules = [ dict(name="pysam.libchtslib", + prebuild_func=prebuild_libchtslib, sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + shared_htslib_sources + os_c_files, libraries=external_htslib_libraries), dict(name="pysam.libcsamtools", + prebuild_func=prebuild_libcsamtools, sources=[source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) + [os.path.join("samtools", "lz4", "lz4.c")] + htslib_sources + os_c_files, libraries=external_htslib_libraries + internal_htslib_libraries), @@ -447,12 +536,11 @@ metadata = { 'packages': package_list, 'requires': ['cython (>=0.29.12)'], 'ext_modules': [Extension(**opts) for opts in modules], - 'cmdclass': cmdclass, + 'cmdclass': {'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist}, 'package_dir': package_dirs, 'package_data': {'': ['*.pxd', '*.h'], }, # do not pack in order to permit linking to csamtools.so 'zip_safe': False, - 'use_2to3': True, } if __name__ == '__main__': diff 
--git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 3c5dda5..8fb1971 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -7,7 +7,7 @@ import string import copy import array -from TestUtils import checkFieldEqual, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3 +from TestUtils import checkFieldEqual, make_data_files, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3 if IS_PYTHON3: @@ -15,6 +15,11 @@ if IS_PYTHON3: else: maketrans = string.maketrans + +def setUpModule(): + make_data_files(BAM_DATADIR) + + class ReadTest(unittest.TestCase): def build_read(self): @@ -65,7 +70,7 @@ class TestAlignedSegment(ReadTest): a = pysam.AlignedSegment() s = str(a) self.assertEqual( - "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]", + "None\t0\t*\t0\t0\tNone\t*\t0\t0\tNone\tNone\t[]", s) def testSettingTagInEmptyRead(self): @@ -525,13 +530,13 @@ class TestAlignedSegment(ReadTest): def test_query_length_is_limited(self): a = self.build_read() a.query_name = "A" * 1 - a.query_name = "A" * 251 + a.query_name = "A" * 254 self.assertRaises( ValueError, setattr, a, "query_name", - "A" * 252) + "A" * 255) def test_header_accessible(self): a = self.build_read() diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py index e6c4287..a665f43 100644 --- a/tests/AlignmentFileHeader_test.py +++ b/tests/AlignmentFileHeader_test.py @@ -13,7 +13,7 @@ import copy from collections import OrderedDict as odict import pysam import pysam.samtools -from TestUtils import get_temp_filename, BAM_DATADIR +from TestUtils import get_temp_filename, make_data_files, BAM_DATADIR if sys.version_info.major >= 3: from io import StringIO @@ -21,6 +21,10 @@ else: from StringIO import StringIO +def setUpModule(): + make_data_files(BAM_DATADIR) + + class TestHeaderConstruction(unittest.TestCase): """testing header construction.""" diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py 
index 43072fa..8e75a52 100644 --- a/tests/AlignmentFilePileup_test.py +++ b/tests/AlignmentFilePileup_test.py @@ -2,10 +2,14 @@ import os import pysam import unittest -from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list +from TestUtils import make_data_files, BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list import PileupTestUtils +def setUpModule(): + make_data_files(BAM_DATADIR) + + class TestPileupReadSelection(unittest.TestCase): '''test pileup functionality.''' diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 28de420..3a6cafc 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -24,7 +24,11 @@ import pysam import pysam.samtools from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \ check_samtools_view_equal, checkFieldEqual, force_str, \ - get_temp_filename, BAM_DATADIR + get_temp_filename, make_data_files, BAM_DATADIR + + +def setUpModule(): + make_data_files(BAM_DATADIR) ################################################## @@ -723,7 +727,7 @@ class TestIO(unittest.TestCase): read = load_bam() self.assertEqual(read.reference_name, "chr1") - # TOOD + # TODO # def testReadingFromSamFileWithoutHeader(self): # '''read from samfile without header. 
# ''' @@ -1391,12 +1395,12 @@ class TestEmptyHeader(unittest.TestCase): self.assertEqual(s.header.to_dict(), {'SQ': [{'LN': 1000, 'SN': 'chr1'}]}) def test_bam_without_seq_in_header(self): - s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header.bam")) + s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "0example_no_seq_in_header.bam")) self.assertTrue("SQ" in s.header.to_dict()) self.assertTrue("@SQ" in str(s.header)) def test_bam_without_seq_with_null_bytes_in_header(self): - s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam")) + s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "0example_no_seq_in_header_null_bytes.bam")) self.assertTrue("SQ" in s.header.to_dict()) self.assertTrue("@SQ" in str(s.header)) @@ -1460,6 +1464,24 @@ class TestTruncatedBAM(unittest.TestCase): return len([a for a in x]) self.assertRaises(IOError, iterall, s) + # Ignore closing errors, as s is now in an error state + try: + s.close() + except IOError: + pass + + +class TestCorruptBAM(unittest.TestCase): + """See pull request 1035.""" + + def testCorruptBamIterator(self): + s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex2_corrupt.bam")) + + def iterall(x): + return len([a for a in x]) + + self.assertRaises(IOError, iterall, s) + COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204, 0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78, @@ -2316,26 +2338,6 @@ class TestSanityCheckingBAM(unittest.TestCase): self.check_write(read) -class TestHeader1000Genomes(unittest.TestCase): - - '''see issue 110''' - bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" # noqa - bambase = "HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" # noqa - - def testRead(self): - - if not check_url(self.bamfile): - return - - f = pysam.AlignmentFile(self.bamfile, "rb") - data = f.header.copy() - self.assertTrue(data) - - def 
tearDown(self): - if os.path.exists(self.bambase + ".bai"): - os.unlink(self.bambase + ".bai") - - class TestLargeCigar(unittest.TestCase): def setUp(self): @@ -2422,9 +2424,6 @@ class TestLargeCigar(unittest.TestCase): # mode = "w" if __name__ == "__main__": - # build data files - print("building data files") - subprocess.call("make -C %s" % BAM_DATADIR, shell=True) print("starting tests") unittest.main() print("completed tests") diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py index f09ef37..07adea8 100644 --- a/tests/StreamFiledescriptors_test.py +++ b/tests/StreamFiledescriptors_test.py @@ -5,11 +5,15 @@ import threading import errno import unittest from pysam import AlignmentFile -from TestUtils import BAM_DATADIR +from TestUtils import make_data_files, BAM_DATADIR IS_PYTHON2 = sys.version_info[0] == 2 +def setUpModule(): + make_data_files(BAM_DATADIR) + + def alignmentfile_writer_thread(infile, outfile): def _writer_thread(infile, outfile): """read from infile and write to outfile""" diff --git a/tests/TestUtils.py b/tests/TestUtils.py index f33761e..97bd2ed 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -5,6 +5,7 @@ import difflib import gzip import contextlib import inspect +import subprocess import tempfile import pysam @@ -251,6 +252,18 @@ def get_temp_context(suffix="", keep=False): os.unlink(f) +def make_data_files(directory): + what = None + try: + if not os.path.exists(os.path.join(directory, "all.stamp")): + subprocess.check_output(["make", "-C", directory], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + what = "Making test data in '%s' failed:\n%s" % (directory, force_str(e.output)) + + if what is not None: + raise RuntimeError(what) + + def load_and_convert(filename, encode=True): '''load data from filename and convert all fields to string. 
diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index 4458d1f..fcc39a6 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -7,14 +7,17 @@ import unittest import pysam import shutil import gzip -import subprocess try: from pathlib import Path except ImportError: Path = None -from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context +from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context + + +def setUpModule(): + make_data_files(CBCF_DATADIR) def read_header(filename): @@ -33,6 +36,12 @@ def read_header(filename): return data +def read_index_header(filename): + with gzip.open(filename) as infile: + magic = infile.read(4) + return magic + + class TestMissingGenotypes(unittest.TestCase): filename = "missing_genotypes.vcf" @@ -199,6 +208,7 @@ class TestIndexFormatsVCF(unittest.TestCase): shutil.copyfile(self.vcf_filename, fn) pysam.tabix_index(fn, preset="vcf", force=True) self.assertTrue(os.path.exists(fn + ".gz" + ".tbi")) + self.assertEqual(read_index_header(fn + ".gz.tbi"), b"TBI\1") self.assertFalse(os.path.exists(fn + ".gz" + ".csi")) with pysam.VariantFile(fn + ".gz") as inf: @@ -210,6 +220,7 @@ class TestIndexFormatsVCF(unittest.TestCase): pysam.tabix_index(fn, preset="vcf", force=True, csi=True) self.assertTrue(os.path.exists(fn + ".gz" + ".csi")) + self.assertEqual(read_index_header(fn + ".gz.csi"), b"CSI\1") self.assertFalse(os.path.exists(fn + ".gz" + ".tbi")) with pysam.VariantFile(fn + ".gz") as inf: @@ -221,6 +232,7 @@ class TestIndexFormatsVCF(unittest.TestCase): shutil.copyfile(self.bcf_filename + ".csi", fn + ".csi") self.assertTrue(os.path.exists(fn + ".csi")) + self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1") self.assertFalse(os.path.exists(fn + ".tbi")) with pysam.VariantFile(fn) as inf: @@ -232,6 +244,7 @@ class TestIndexFormatsVCF(unittest.TestCase): pysam.tabix_index(fn, 
preset="bcf", force=True, csi=False) self.assertTrue(os.path.exists(fn + ".csi")) + self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1") self.assertFalse(os.path.exists(fn + ".tbi")) with pysam.VariantFile(fn) as inf: @@ -244,6 +257,7 @@ class TestIndexFormatsVCF(unittest.TestCase): pysam.tabix_index(fn, preset="vcf", force=True, csi=True) self.assertTrue(os.path.exists(fn + ".csi")) + self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1") self.assertFalse(os.path.exists(fn + ".tbi")) with pysam.VariantFile(fn) as inf: @@ -668,9 +682,6 @@ class TestUnicode(unittest.TestCase): if __name__ == "__main__": - # build data files - print("building data files") - subprocess.call("make -C %s" % CBCF_DATADIR, shell=True) print("starting tests") unittest.main() print("completed tests") diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py index fd80a80..5043d1f 100644 --- a/tests/VariantRecord_test.py +++ b/tests/VariantRecord_test.py @@ -13,7 +13,11 @@ try: except ImportError: Path = None -from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context +from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context + + +def setUpModule(): + make_data_files(CBCF_DATADIR) @pytest.fixture diff --git a/tests/cbcf_data/Makefile b/tests/cbcf_data/Makefile index 796c3a6..9c3fe75 100644 --- a/tests/cbcf_data/Makefile +++ b/tests/cbcf_data/Makefile @@ -4,7 +4,10 @@ VCF=$(filter-out example_empty.vcf,$(ALL_VCF)) VCFGZ=$(VCF:%.vcf=%.vcf.gz) BCF=$(VCF:%.vcf=%.bcf) -all: $(VCFGZ) $(BCF) +all: all.stamp + +all.stamp: $(VCFGZ) $(BCF) + touch $@ %.vcf.gz: %.vcf bgzip < $< > $@ @@ -19,5 +22,4 @@ example_empty.bcf: example_empty.vcf.gz touch $@ clean: - rm -f *.gz *.tbi *.csi *.bcf - + -rm -f all.stamp *.gz *.tbi *.csi *.bcf diff --git a/tests/compile_test.py b/tests/compile_test.py index f56adb7..300ab92 100644 --- a/tests/compile_test.py +++ 
b/tests/compile_test.py @@ -10,7 +10,13 @@ pysam and tabix works. import os import unittest import pysam -from TestUtils import BAM_DATADIR, TABIX_DATADIR +from TestUtils import make_data_files, BAM_DATADIR, TABIX_DATADIR + + +def setUpModule(): + make_data_files(BAM_DATADIR) + make_data_files(TABIX_DATADIR) + try: os.unlink('tests/_compile_test.c') diff --git a/tests/faidx_test.py b/tests/faidx_test.py index 171fae3..72520e7 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -6,7 +6,11 @@ import gzip import copy import shutil -from TestUtils import check_url, BAM_DATADIR, get_temp_filename +from TestUtils import check_url, make_data_files, BAM_DATADIR, get_temp_filename + + +def setUpModule(): + make_data_files(BAM_DATADIR) class TestFastaFile(unittest.TestCase): diff --git a/tests/pysam_data/example_no_seq_in_header.bam b/tests/pysam_data/0example_no_seq_in_header.bam similarity index 100% rename from tests/pysam_data/example_no_seq_in_header.bam rename to tests/pysam_data/0example_no_seq_in_header.bam diff --git a/tests/pysam_data/example_no_seq_in_header_null_bytes.bam b/tests/pysam_data/0example_no_seq_in_header_null_bytes.bam similarity index 100% rename from tests/pysam_data/example_no_seq_in_header_null_bytes.bam rename to tests/pysam_data/0example_no_seq_in_header_null_bytes.bam diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index 3921e8a..c6ad884 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -3,11 +3,13 @@ BAM=$(SAM:%.sam=%.bam) BAI=$(BAM:%.bam=%.bam.bai) CRAM=ex1.cram ex2.cram ex3.cram CRAI=$(CRAM:%.cram=%.cram.crai) -NO_PG:=$(findstring --no-PG,$(shell samtools view)) +NO_PG:=$(findstring --no-PG,$(shell samtools view '-?')) # ex2.bam - bam file without index -all: ex1.pileup.gz \ +all: all.stamp + +all.stamp: ex1.pileup.gz \ ex1.sam ex1.bam \ ex2.sam.gz ex2.sam ex2.bam ex2.bam.bai \ with_md.sam.gz with_md.bam with_md.bam.bai \ @@ -17,13 +19,15 @@ all: ex1.pileup.gz \ example_bai.bam \ 
rg_with_tab.bam \ ex2_truncated.bam \ + ex2_corrupt.bam \ empty.bam empty.bam.bai \ explicit_index.bam explicit_index.cram \ faidx_empty_seq.fq.gz \ - ex1.fa.gz ex1.fa.gz.csi \ + ex1.fa.gz ex1.fa.gz.fai ex1.fa.gz.gzi \ ex1_csi.bam \ example_reverse_complement.bam \ example_dash_in_chr.bam + touch $@ # ex2.sam - as ex1.sam, but with header ex2.sam.gz: ex1.bam ex1.bam.bai @@ -36,13 +40,13 @@ with_md.sam.gz: ex2.bam ex1.fa # samtools view $(NO_PG) -bo $@ -t ex1.fa.fai $< uncompressed.bam: ex2.sam - samtools view $(NO_PG) -buS $< > $@ + samtools view $(NO_PG) -bu -o $@ $< %.bam: %.sam - samtools view $(NO_PG) -bS $< > $@ + samtools view $(NO_PG) -bo $@ $< %.cram: %.sam - samtools view $(NO_PG) -bC -T ex1.fa $< > $@ + samtools view $(NO_PG) -Co $@ -T ex1.fa $< %.cram.crai: %.cram samtools index $< @@ -50,8 +54,11 @@ uncompressed.bam: ex2.sam %.sam: %.sam.gz gunzip < $< > $@ -ex1.fa.fai:ex1.fa - samtools faidx ex1.fa +%.fa.fai: %.fa + samtools faidx $< + +%.fa.gz.fai %.fa.gz.gzi: %.fa.gz + samtools faidx $< ex1.bam:ex1.sam.gz ex1.fa.fai samtools view $(NO_PG) -bo ex1.bam -t ex1.fa.fai ex1.sam.gz @@ -65,12 +72,16 @@ ex1.pileup.gz:ex1.bam ex1.fa ex2_truncated.bam: ex2.bam head -c 124000 ex2.bam > ex2_truncated.bam +# Append a corrupt read with block_size < sizeof(bam_core_t fields) +ex2_corrupt.bam: ex2.bam + (bgzip -d < $<; printf '\37\0\0\0\1\0\0\0') | bgzip > $@ + ex1_csi.bam: ex1.bam cp ex1.bam ex1_csi.bam samtools index -c ex1_csi.bam empty.bam: ex2.sam - grep "^@" $< | samtools view $(NO_PG) -Sb - > $@ + grep "^@" $< | samtools view $(NO_PG) -bo $@ - example_unmapped_reads_no_sq.bam: example_unmapped_reads_no_sq.sam touch tmp.list @@ -89,9 +100,9 @@ explicit_index.cram: ex1.cram cp ex1.cram $@ clean: - rm -fr *.bam *.bai *.fai *.pileup* *.cram \ - *~ calDepth *.dSYM pysam_*.sam \ - ex2.sam ex2.sam.gz ex1.sam \ + rm -fr [a-z]*.bam *.bai *.csi *.fai *.gzi *.pileup* [a-z]*.cram *.crai \ + all.stamp *~ calDepth *.dSYM pysam_*.sam \ + ex2.sam ex2.sam.gz ex1.sam ex1.fa.gz 
\ with_md.sam.gz \ *.fq.gz @@ -100,6 +111,3 @@ clean: %.fa.gz: %.fa bgzip < $< > $@ - -%.fa.gz.csi: %.fa.gz - samtools faidx $< diff --git a/tests/pysam_data/ex1.sam.gz b/tests/pysam_data/ex1.sam.gz index 8dd2bc447cb504be23c29aa54d1a7b8ccfb8fa73..16044675f2473b5bccf026d374ac1ee04dfe6b40 100644 GIT binary patch literal 109698 zcmV(&K;ge1iwFo7Ig4Kc17&zIE^}dR0JMF{lHaDV^;K$Wt)vY6HH-98AzC@ zQ}T^XAtjlrfB(1mzoXUF{r|x4b@i`*r{Dg^f1}_2OAWt?>c0s5=|BAES&Tw4kahLh zBYXg&JK;e{cLEG3f9~)H&-)$zFA=+BKkUdX9k?2 z<~R-D#O+)L_6??-~aYdwJ-u!Ro4-KdstmvSHgDt zDkQ@jL?wl;ifA8?IlKVWB;4(Q+w0{nuu4a; zg$Vo9T$_9rulVU2J{t`ytKXb{Ie*iCzff)fFx*nYP+p~<>UK-F{N?bQ#UEp$KeP50 zqwJPx+b#W1?r#%Zlu;-dbx<)#!PY4BYZP{^(R3SOv0SR*np)@Futu~0cD6#Hf}7B- z5F%Jzo5v5{-)i<>-+xHS`?%AN38DX1U_!u%spvw8#_|)7D`vYB@6nRDstB`Psx#TK z?gK)MwM#xZT}!Q}H`Er92CyViL@5m{u!%;i!IDUB@!*pbqAH^NS+sZo;Kr!D55Z4v zrYxR0cn#oP5McD+mlBSlv;#^72&Js65=8i?Yjzno-lYC<=unzX`yOC99hf@pqSt)a z{cwGyN4qSmBFg7H&f(Eb@1|*XzaQTJO}EIycQ61R{$Cl;z_%R^Zm?Au=juZTUkX7_ zfR>Nv-*DSu&jBVZYu@P$QVG862F_-viV(pZazO10!b^eJv<}TB2Dkf_Yi~OoOIS(= zm#B3WTA=I_sajUD1}jp6`Fs>slYFg1$FoNN1`nx#MYVFU-O6;fn?7x{-EJ0JKA*}Z zHG1D>0Sx9F{=KfViy1_5nObX;ranvSx#ZvXns4s5;A0H;1MY~?gTstS-65ofuRn!! 
zaGSSo>(AnoEUwoS(V-G=x^^G6JGtw4Tj>!2^NeQC!fWT4HBp{h6Yyt(s%WL2%PaNl z+`IG=V|2Tl()n$C{&jo)WnL}SE?1qGOJv`{z%}*ocP+t8vPT3wQtNx;4D07YwZ5CQ za%`zmzz2@*rBnU+7w7B5ocJxq8GU=_qXGB9I_^4X$<3YxYZ9s=%8}WFeQd}_jxNC) zhyJp>T>K^Nbsf5334B$p4A@D*meX;V9$?yP9U=a~2xhnKkv@0n)z2mWSn~DDogRVfJ|GC4UQt5kQfY@arC*^ih}ddx!?-I z7YX6rJjf@3#6N>SsZ!FgbZ7W=sYObs3;n?f-e&dje){~BZnE>>0PyP$|C;f2XXyk5 z&aDHe(z9^hj5$ZN#1Q4=FjY# z=<};hZw`Fe?cKt1DzNi1=xAW;y$@`h!L0uGh zjU>L`+%`Fu$5HTB;hkSZdCw@HFRyp%%#=+ZO$H%Z~v>#9R}pIb{zM*X~OeYTv;ef)}X{)KOisC2VHy}-absHhB{Rc@Rd zzf1;=ldIM?Jp-jBjs1lh+0vx(pPgS0sVFB=42In`gtZ1!WBieKW_m<1-C08UgLgh) zVnD)gUnyDLWn=65Q#9Jg^Ca#z5+!pkWDtgm@U>v-sG>ed@6;hfy1*tv9g z=esv_8r!%H-iSjRs_^QAcZZ(R0Mv`|JC7Ncb&U8AU`73cpRR3VwNL3++UwrI33VcR zJelCNdzgIom~C&JysgSG@2mNt?!fNPXOJ@r79Vbsw_w4- z3((%)W8QuP(2Bwiw%4!{u-*C5m)Rujbx6?M3wS|=R*~X4PcD6n6ReZ(H@>Y9zT$LN>0cvj3SKA58Bs6_TYmnyTrTQ#Qm4!1Tz{O-3M`LvA6KPRY-&5~c$=@E`S;8I z-w%uAz}irbfWY8NuqApkb<%4*3*$2v;k?AIHb^|;oU}vF*Ry>g9|wO<4O5IIG>(}6oF-q6oeB} zV(3#?P3B#sX-2Y*eP6u`Q{HxTf|LYv2uMc}FFI#z-yI#-BFN`_hF6Y|LyS3pKD+f) zizNt5bi`mGT=Vi3YrCZIXsZoamok@ugY~`fg?l5Sv0Yh1nyB%Z>41Ks{ehJNALdtt{evRI|QB`$e^gv^%%ow@%=9 zc(yT!SODrQS{IxEPgp3n^MPCXTzV>je9ara+R%93oki$F zUSXn99*mde3(A3Fao6tyN@p@3%s89@EJf|!d0qp6`z|o_0c%=AnB(W*f|GJVCvFSe zFi9ST;o8r?^c&LKxdsF<9p)1}AZ9*^C>N`GIr9q~eMaXU-oSp_VP$>is#}5E?y4e2 zctN%H+TUPztoXL}cKeU9xyAoz{~`zUufJZmU5afEo{kbCNonDCB?ey^fesxJ)Y`HI zW=NRjz--5dq}ofc`4qVRlI?9kn?vat${bPTPY>)ed)}#_TWA( z0yT>Am^*+bc5#QNTmV?t9J|7lW%DlJ$>TWr+Z`+cFT?yPR-5Ji+E14k@DTohw;mj~ zh<1ED(S6?n^S1caAK1MxrSLgviIF+@!NLi_fclAdiM7cR%$fCcBmYg#M7?fHR};X) z^nt~`NM^bDgI!k_>>4R!yjLEsvrnOauH&m?G8<+A5dWV6s$GYPNU{6O;QRLqYW?%C<{rzyu{OVWjR&drYpg-!1S*H z3&lDSFpb(n)exBD6SM*88r0Ng1`ou&9=qAFZtk!lA@C(t6SD*!&_DaOgw{TeHw)hv!l%+;% zS9^{#K#b7h1h?q|7Pl(HyjSM?!>a2ELRI%}w0^IRd9St6C+BKwthH`ho7UUvX_5-) z7R~C&NU>)WYpj@B+Jf`d83FT2zrhK(-CUZMrZ;wpVsn4n*QEn{?g6LMz4HY7E*;@~ z3^?y|I8XKzoYSW!S=VTNx@ao+IA-?eGqQ3ub8M+$$^ll0)PrU~6?^Z&`@3K+xxAh^ z?oH<*zg4!TW@CL)U_T!?C7Ai*sZKRYb!uvy*@E=jrre7}Uk*QC^N;us;GDiR2yb-n 
zF{*^)Y2Yp?=8p14W=;LUlu0VU96k5`C7pPJ->$pIU-}irb`-FwlC_|XaOU;&n{eiJ z#DN^^^2b0w<(DqJ1jO=~x;m~ws5A|hHXKfB-Zc>%I?NnKOu7w* zUa=sV{3nt9r*)i9j6=9bGBouxlhn-g>D-Vwl$hQIy9BPsu%ImZv?@DLWtddbf`?P5 z5ZU}<&de^@rL7H;HfHlFwBm_l4iu}gIX$ug=*yR9-*HOH?QT2F?i0L8aG9*bk`YdO z^hr5@3a30fQ20Z=1t>L=SJ=82aNP29QsWSD6hZHutC~mb8R9lUN1q5*X7sw>#M1){rdQ*Lw$}X?QIQDwK<*0@2e-NI#FYLj0_HzRvY%XS_?EWDQ4WEwoUYm zu_B0tR?nSz&C3EcHk7Ms!%18%U^R&nCC5sFm|B8h)A|@3D_v?kWZX4ac?^@+?@Au+ zbFKA(7$Xf~(FZsm=f<7Y`E8n{m3mwXwu^>H!`fr>7I+|mr*1U2M5>*eD?xocW-L+u zGW1r2){&`NHr|YCo+q4-b z)vl;t%-rCFmyXKR5KQxEqlIew%Y>s|#Ra8P>gW1C>O9Kac=Ot_pf(&+i|GcO?~?`s zWNa?(-*wIIJywP=?X(FgI3N=IH@G{OAoDQEx*sn#4YNyh=mcE;8jYzQw%?5EfvR&m z*I};CTDjI2(Yy0@?bIarl+imEwP%45;PhPu%TuDsY{D5@|2=VwpH>!B)LU;8xJWL< zU>Qi}cDj(gxdK3Z460ut1e@Mtx-^Np_HO;Rd5~)RV&Ytez-d6f>D)cWW}6^4;hcYt zTx(Ekd{+;J?KFFgPN&iclq2OvREE|%6>zQ$AFIv{J)h^1wt>6$D-S)aCU`DjcX{TH zBh7m+HzJMe&3;p)a*WiBdlCmf4!adJ&O4!>%#-zoB}26IWJVdxdxrTyn9mL-))e}9 zXb@S@@)y0)PA zN1h_uon3Z*>pUJOFB!IMDBdScb0dUp0PvyGoQue61!~eRrT5PY^d9YYP;*9g~0_4xF)29O!!ndsJ((ru8 zQ-t%RX+Pl(&JGKB9%6{PW-y(=U|#rtH~&>g!fG|u8|IAuPafRr;I8&7jt^{aS!dH@ z_3Md+d*yKJvGajH0{C{--HBC(0zUGmo_tO3le3D?+4EkV&bG0b1BC})z$J#2{0QI( zH{Ty{EaU}VgO6`#!@OtJz6NkG#-Q5@9(qiW5^*|+nyaE^E5-7raR_8`Emt;du>qV;A7jXLG3802HgU$dB8Ae-k@5RHB6`l9u z>g>G~fj*au<0;43$=T`Viy~5*+yw8{16qqCrc^Zu?x?V7J*#eZ0n9eV6ngUf@HeX_`87Y%I(drEkf_K9hmo^x=z-=nQ1lxK( zuepMLrZ==44|I0QfdAsaA>a{~=lLHTQUbqq#!JDEBb*%7)pah*1^&tfGW6qT9f6hN zeGXc&(Lhq8wd60>2q;d~NONRC%`2CVJ+h!Q4WPll2EWy6UL*57=WBG%O&4+A`K`B| zX@c)EUwZiZN{2c)FiYhO?7GDWo0S|#Kyhk8x=lc5SVYLFqCwb3>n7^mKxY`U`b!^J zjtbecF2tBsf=Q2PXqL+12PaHdC>IC3f(I_xZbX!ei1EGxmmdB2{r&A++=_QRuq5>I z3@6-s@KQA2|90+A=Vqgts=K8%T}b`4GzbaowtHzLmYYhs*!qaYM~zv329oEhd~V_qrY&qJ&!?S{@N_OZ*6?;npcnFqBc=tMaI4v%fHo6F?50C54^DM)Ue;uw&zd z({yE+_vHy7-My?~ccF(8DjJK84qi_!^de5x@ty3zIYWXFk&4dkwN2f&ON)Utg#NN# z0*Z6`wL1nB2Rz$xrLQQ`N3)yyV1_5m&%s2_u*=qscg_>0uzXPGSwu<;ivCg{PQPGz z-3+3R&^yg=`_U@>tBCWb`p4J#^E`ZYpFcl8;PdC_`Tg^A5|56%3G4acSwlW3vK|;7 
z+y&nG=53V=JOzwT;t&}IXz+LHE7auJ`71R>?QScY;TRplB@PrBN2#TB3sUNRIH&i% zu9HfZNp7o|2_g{@9{CT36@@32ThNfF&`T(L3+gk`0r;N*mhysLs7K$idr=v{}gZ5MReZ%E>V|X21E!hskce%5x57q!59|xHBtlw)G!i z8M1DvgNETN#mT?Pxvw!PU4pq>hD=>Laz8H%_wWRcc?%8SB%2{nwdT_!iu%pUV2cEm zp>c;sTpR^VvPP~zeoRL$Nk?;30@+ujMv3S=-~cX@@~D)N~h8-UJ*NUhC>NXdpHBN?ycrJakX_=F(Rb6wDgmSH00 z;@(l5BjlxQ=RF6C#i!aA|0F6Qd=d=#Wxx*+3Na8s{CE3&y0lw1LuNPO!_w=h`UgfDO7U zFgXE1HwlV~B3RdN`5lR6{IgN-|D579est@@d};*AN4&?fuG8Ebm8SB2LL6H<12#!4 zg2bOPIgb0pKr>!{4Poc6#;RZT{0qFC&$qYp{Aj;vvyEydA^yr=$~K-~CaZDd@z$?w zInmkzQW3wKPn+rMy6+UuH(qA>gY^V%Jh57sr;S>Ig#onQ&Iil6!F7fFWk>~DoRm4dTQ%rtO7EhZa zq>C>|Z6?2yNIhb=AkDSKXd8a#e>*9jUwgRnwT#H$)Ybs{ip*{KIZ!H0Dv8C>|m4c_9Js9X<3-dl??n_8Jy( z!W(Mc)#9LBq)|IKTPYgM@=#`!9D!z@1bls%Sy0D@!3fV9ww{g;UFQb)GRPMuh{3l{ zNyAS8K26#5wb?etig^;aQ{b0aD_@Z6d`2aWGH+PU#g0wt-@YE0C7ENa9=6?a>r`iF zbi;yHKA_4{jzD!!PM`d;by5e2R+(|)K*>0T+y`9(8r~ZDA+Iv#QoWq90rR5|3$*{U zJkb4m@|hQ;b0hMC;^{;m4??%#(M7VY!_<81mUdxwJ|yP!7!#n?T? zHuIpGxg5l?cu7-NnZl`H#YulgP!0BLOTBGLUwIGzM3rHxPB5HN%_BRSXheO2BG5+_ z5>a!~U`Aff$U^&>2FML`!0oMfHW$K-=PN44uM*b2x8Q7(cNi^s-3H7aSp`3|4EV=> zH?>@sz6bLNw;r3F>=IDaB%w#C1a!`dCYBn9Q|nLjhjf{@I9Ei#zsCarFTU2yAsOLB zry2ftw=L+xBj0g?oXzedIzy^H+zk&;LHhjoZlu<0mj{#-IV4~+VV zJ>)gG_mbyvI8bsz>y0?M@982j(_ApIgI|r)?uDVmZcB@y`w8m_Z<6Jk?k6GgkQG}& z+42B%!A9!nKbqW_FX4x@Te7laww4OE9-=v8SaX{)qX*vQf#wp--pao=U}EdFw&4l|@mZCQOL<|8gsG7d#Br4_jffv&^uNsc zoX$e*I-zayj)v`FHSxaJFDUJR(riJwVm(dPDm4%yAERP+#6u-8LRO>m$rlu7$xx4C z^97Zbq(&MfoZucsXa$d>L=8D%>QBs$m0XRqnjdQicMo4V~bNi?{Ee4YZU z3g9=he7%)?-(^qSybg2O(vy$xRTB>^XRa)O-PFk1jHxkpi}=JfH4js1zJzlM6kC^F z9{)muH{|h&aqog+`Yz`gk$_Z0<9M(%0*b?mBo43_=-AQE-NT2v9TainY9b_Ka|_5{ zV*9eE*0k{@l#k9TC6-;fl61w{oTGS8BK@;@pSYsq@d~xrh$U&R${9GJfhX^*L9&uA zmiI(ou>usSik_q~`7yNI>NY>(d~JPve{0mIx8iG+cT%W%wbrTB%3zzgigax|Ph@S^ zi9EAv+v;bj5`Uw)?L`=G7HNJqrQC$nx!@pb4>zh5Mu8lz3s6D*(o>l1( zr|vIPopn&hFz>T4msK0d==;#@m;a(h7d)ULY8L65QeZYadyOr@|@c|nKa0L(RKH(uy3)NQ@gjXXCW z-(K!jTFJRvIGecp9v>*?lW+xGqkY%MUVypW%iKoKqM1HTn*^4o3U$mjxj~|iB2NtG 
zGX28@aNB4#AXa8#!=Cf+Kk7vwiA(mg27@bO6@Bnzd&t`}4v5rFHT zP+w2nC%pG%oO$Lu{aK({odxVY_BREh{Tu8P2a1XPuJgKtlTXz|Y9+{yw9gVfNj2B~ zQlF?Q5g@9^mK?<^b)LzI7?SFxQgQ+0cr@9QTve|~Dk+I)U*>@}lo{IA3_PX~Dyz{; z&sCZY4y`G*g7ncLowZGgF4IR4xVF<-|4EU-&sfnH(Wv2g%1H`0Yu#^H&-XfKK;2YO z!mWFv%cB-KCx*3oY{nwFo8fdy)`#V?M^fJQC9>Q^8Sy5{&;6Z(uhMjbRq~pzrfA8q zOKbUfz=5_mn}+mA<``d-V%jPqu8`mX-o;45*2C$v+z<8iau#7emdUW&&))fc6tuJI zLBr|fIa?{(A&Vl)Z7hGyZWhv4Ii@Mnyz{c&jI<(#{(;hM&Mnz3b>1=3dGJB4v51!4 zB*#i2$o?3_2*qai?hVtR^kHRs5~5O&$j;pMiFwI;|#i zkF;mJ2%IfIh5yxderW^LKYv_vKc2^^2>{|Z9*%INNcQzRK+~xE<>a(Tk+v^` zk5LHnd8=_Z3CC$%*0$UNb4e*MF3DePW1aW-+zT56DvEI0`7uzGd6jPadJeop@FX?) zi2`3TdHT4&<-X%0g$r*aM9zWty@Y9M986Rh=wpSCq;pbsXB~4D*s%{)}!R`esE+D5=%GSDT3lL8J;$Zj{`gcF zH5Q=iQ4NcWy#A8gsW6F6YF(9a-q*5v^sWYzj$}$F5_jE#;=o8BlHzMm!UG#%2Gixq ztm?94KG#E!Qc&ZAPnCwK1U~#M($k!h2CY6Z*fQYQ`l3`A`SDmzuv}3GgcU4NO|8Ii3Y+V_dxGUK<^cjHzuqx zynpn~L}{)6Ue6{?rgwg-c{Lo-5m4J^cZw?tOB%2y+Ka`(L#mo2;cuI|X3ZmEK6bll zWXD5ow%#e#HP>K8RuW>HEP=eQGWn%WL~g&BF|kd8AEgz%`o=Mwc3f@{9+OZdl#6X8 z_Js92fb=!>NzD<-reW8)rPq{;Kucdhi8wWLLT%3E`QLo*JLx##EV-qIy`w02#~%DT zbIliV;`mxkW2$Wa_xD-{7i3X}E6*u@P$Y#&G|E8n%G)N|Z@8fNH%|@;RP$|M zk2GmDiKAevPA49RPzP#h_@Ul}lZfxV(>O$>LpOA0xiC?HI zhAlfpLh0VRh;t^rjDA;AZt#AWWYJX{9<{XR&5S$t$O8_ z(i;Oz%qHPLH#qOi9U23*IbnAQC_0yIiVp40Wja@}QI%sV7{x9Le{M{j5?ox;VAHH3M?h^?^9YJ) z56d{K$unp|)6ya#<1u)#&Bvs#!EGd*7>ln>*E1*SGiQ_DvYtpd5fo;>RhrH$9A_As z(Og{Ed7s%4w(NZ8q&ZYJdgZr1ti(goCfNOLbhKQ`K#Ni30}m;9#N{P@AZ1sc^6Z8@ z|I{(X20?|B$O52pJIXs7yrAso7M#r@(}+i>L&IN36vL?Gr8VYhu=>H8ZO2H7tgYUJ zwBr(LT4@{Xv26l23P$+o;Drynlxo{lGD$-s^R_kY(#v}CEa9auK`a%#vZ(3qI?}5) zGMtl@7wBIU59I1U&&3b}X99hxsfJ%CrP%VhnI<(}|I@@HNmF!duK{$(K5?V#-$)a*RN*$bCQ;=;i^ z(ng>X+iWk>v7f&2YQKj!r)9`)6R#T@+uo)kR6L~dtqC8jF)aM9pjmEUwGrDxaW2g> zV?W~S!n2FT<08s@mjZt`u@KjCWu*N3ZspoJjop6p6_3|+N6A_Q3UScI`;eiboyNdua~Hj=D*5@Fvcf~&4hQFQK@yL_2-oLXh?nSVc=zy9B^OfQ>^ z5GYABx1INPk-&G;U%ymi(}(;3N3*$D5=lC{#+)Z}ag)HdeQu;!nwJ>sP!D%m>0!FO zJ$kC@ck`Zs-Y2_>e#YgJYHL#>Ham$lX6HtxJZ#u{pYux8a<(L}to1hB5?D^%)iIpR 
znIar7rzOqM__@on?Nrrx{#5_?NBTrE4XUhvcv5{M=PkxhNIO-HL)MwwF6vMPEC7qS}mD6e*CHOC-51KFK9{4}w8 zb9pPc9WAj|;x#D8S?k7Uj)>5)^AY7a!0|SYANpeeNW?tO4~8~d01Suz;5LE#*~OuF z(&a*czj>QFtpKFO0&|8eF6_+n${~Crj)xM+pJ%!ID(!A+niJLQ?XIqwYwAr)gIC1) zQw5aGOSj~5XK4CUg>D)}$5bt{bdr!>3shFl5ztTAallwY_$ZO@2XF{l0|G0ur}Gj! z*uc(-X!&GC+tk*%Pm=m>3lQ(w&WSkr>*VhmK1fi$$8#}bQqWZL)zLsQZK>R zLJyiXvrd*&P;9>g=%TvrOZ&}+GdiaOJz3J$)3SE+l4iaXdkz*nx8?*doeOcwl+`~m zM+hD!>W_mT%lWZTVv%&qHz2M~K{xv3uc!nJ$F`0T+D9UCYCh+y+{c8V8C1s0jk3@e zbcMDoZI!-LmL_~}-Y40}bhsU(n^M++^nXLm?G-0=vci%?&?cm-U(N^Pk4%7$HI@7G zYDl!YT~T5{6)3$@JgVeB`X=XZ7yHJJ%zcBU(oy0`^;C%9OFD|6UHb%-uT`O?6&~4i zOkodVe3S@6cZ%76!wPv;w^Oy-R*arAbhS2sSvjBn6}K zi{0G=c}5uw6VBB+;=E=-4nQ4h@re`qQD1$z|=67BK zjViXqrUizF*9_0A1m&=y`ej2Y9Z^`PpDKMI3s=8H>6NGL$kRC0@tw!%eeVC-f)giw zLJDcS>g*=QD{zT%W(I-`t}?G7ezT&r;fNny4ubexhWO}5IyLpDezqYte*OAv^acJ* zpBe~YRx?q0u~7}|4d(WUAED-8c~!wk>ZmL%Cmz8{4;7Cqa@W5nc+$UkMMHW+`|M5>r4-mLL#hsl~Cr?!ub!%CHCXd*h%TY%kYZTIxXwPuqHDE zmqC^l^WYF@f2im+)yf<;o`n~vn@L8rjFhClhOvY{#}ho&@g0Er$*a^}r28j@R$ zGIz3V&uKdLC^@B&s$0o|BF5Z%6}$ec0iye@3Sz7Bg*=?PMmBgk0@}Mml>5pTegjZ& zcV(t8?nK3Zx94+4wtKkpocu^;A{5Fj6>vT)p7TwAlE@mnS`=n{FlJ+KryB(pH4S!^ zaXzaqWA2)+XDEli-l7Bu_{cULahs-co@`0w%ORE9koG6k&(8Cv{YyG4?8u<9;6f3% zE%>5kq_F$nZFF`X$Rar*iZ~WrP(r~V`H_t+uVo9AIio20h>c=pm}v1sTY66|Z(vJx zM&B{3xSJompGCldIcKqlT8+*r&(#B8OhRr#@C$v-9cMjGRUB`LG3N z)wd2z6AO5)^mq79N2Ck7B9{RR%0zrSkSaUPmePm2ZcwZE6D>6knTx4wOr{>wptDHW z5|B8sR%|GGwB;5`fOhq z5K{VWWqFU4LkR&LN_t8*&c%UJ%AvNKaLy76<++8>b|D<`?`ZhQ;>@ALGAF<7a`JBa z%qp~w^U={*BUfbGcYx@eyQE|kp>_un#}i@P-gIK8eHY22NNy*QxegLcynZ{My-ycf zyyon)LGp3g#EX))8=6ieUv;F9X;{926CcSC^67lE!*Poksa`ys#dF@}I<3wsj46Vp zT~T>L3Jl_0=4Q#msqA*$+IIY-0CSi&Ov7YcY85iWS%HnWZ{PE#X<<407ZZ^RluE2w zo{96wST8d%R{`YDD=~EX9`yucXKSs-xo41G@IAFp-48kv(GYDC)JL}T@mJ(>ga`1Bv65Px7r6?c!k{h3LC&J)212Yg#Tt# z(%6D?B%LFv%MQ&vDLST8knq=mOueBg1*AQY4&EfzQja+M#9eUtzHb6*=qwuF#YX<+ zUd?NYZ)pDaj?`X!Fbf+`99P?Lwi8|y0GmL05L`+OK$BL^Iq#2TM6gLbuO{a0Ywjus z!;?(iC~6Vuv#OEHXjfH1Zs9$^+>Q5e@jY*^;Iur}TQW>#jFXze2rZ7n1O6|0BomgU 
z&ml;1H3UkY-*{X3J{v+ezTN_~*Lz7*3v~vsNd+)Yy54<6JAo3|{(u4yEGiCe61@vi zZH2ksw{~bE4OMf%gi;cWTza2GWZ3fhkj!GFL z)sNjYO=lYPnv;RWJxd7kCVrDJTScBQ1~H+|^axfBP{)oFrWVUkt2tG#l*?erdE-;^ zd+jI|KmP5ZYHCwMx-g;Po8%+tBe+<`f%jf-#2a62)7bv+ZLXbBB4a?=dyl$g7t$iz za0I-iafFyNPVJ2v2PFpr5#q2$Cfk6LnT&kX)P`k4jiKp^Ehly6%cWsK|9LPD8Hphe zCW_ZSwC+ee)-gV@r`pZEqQ6ZRIF-p9fO@{|Di2YVabnpsjR%I(h|4&nI1C7k`bnz*xNXjT??uuGh{r#+y$D#N^2K?moOb@&3vRv^sl#u0*{zH!YpbD7bkwj!mHqAxEmF?vudhv%IL^L4GH2KS*QL zz7&#Qky>*=Uz`EuL8$g^zOe+8m}ZbqC}95(s%U(MCeKnd0;R^)`K6e+sZ2*Bt>8Xk z&zR=MiPLxxF^b=Wawb}8OJSxhH>AA^I-L}m0!xZ)BTg+XCsG1a+wBIRrg1fZ0w=VQ z`f8>!I`Y82&xs=U@26lw04R{0X##&9&lQ-&?LtmI{O4aOe`jWy7YkCuZJ)5N=k?Rj zDV3<|ik;U?;?Hmj9}4t{!Ktvac>S+U6PrJnFzF4ILnq4FI=1=nj8pQux6ea9MUXte z_qorlpG%PROii^NHwSrR`O)F9auy%d_-scgiU%OGH@O2M)2kW~)XQAcR06E2;Xs-0 zPqzDbtF|0e?#aRTbf|-NP}?)=+8DpI_xGRb znyy!s{MAz`B4yGhH>x-7wx?~u0i`Mwc4Zmvmr7n&4moz-vZOCDooOPP0Tq8WPX7A% zOktI~+Rp7gajcV@4{w~S|)q1@l@0<;)CLSLFPpMUc zFDR3lx7ZZn^%kT!Uh}M`bil1Z6){(X7x31QGTpS*Z9~p9$iEfID!FJeHqqQok^04= z`l@&YV0FPF;R2`4U9KChRhx~p)URQKrcTG{$I&N$

D9eYS*nJM4m`AoORobNXJkYESODDcZX6N6iH_kz1jX-{sM@hZ+smnENW|$ z!k>bjm`XQh^JAI+{kX>%q&XVYMFn zM`d&@Zhl*43CHl+a};cT4Z;#T z0eXRO!B9+_^CcRvronZW3#=ElBa{J8h|R{`1w9)Th81Sk zO4h9=Tp8z*5Zv(PkByLEHv*hktn~9z?+B-8`?bplbyXzc2q`e{8!cA@ zG@56rt!#y?iYp4Qf9l&`|I}>jtT1_RtVT$2{F;QxGt&DSQrnYx_C07#ZQE!-lw&F( z7wxL0NpVy%@{GsXXvN+CkaKFqC1q8gmnb#1Y+Ci+1aMCOU_*kYqsseY4R|JYu>ZQ- zeCjyuaX!a5akZ<{85_$XWJu>z2)Lq2OVvo5SNRJ<*A^%yKT3M2Vd>`L84a08@}2q#(J{0V+>WJKatW7LmU6cAd*NUu{nOfIp34py0NYm!xsA zT}uzKit=Z!B;ALt-Y4YxI;PlwkEliY6V|#tLWxJ0yp!E1$Cp16sgB!EhK~cIf~Iq% zR}uLThPR%RdO2dCl2sY!b43F*q>r;mB3$Q%3fznG=R9+N00Vz7rK8~xUE4l1Xy8{SsM_2eZKH(AiHmX$D0 z;u5R2O^RbDA1o&3C!CcFK&8*qgL{DTF^%DrCK=-qO%a93GxYLMohPuD-oR(kTPBq` z@;aE$p9PqY2a?pe)A@AzK7D^XrN5EX794#%0*YE^)WUdHN0b?6XsM~O2mDcD29f0F zvD?HGeaK>>u-3D*oIivUo`Y%_g80H7ZkbST@8d(u<|I$=-vVBPNdTorNW-Fok2QV^ z_6c6mz=1j*J_j0aUElTx7J}~;vq`>to>Y#Zl{tSIlfKk`fxW449lQ3-kfMy;g+>TF zw)RDwkLuSi_v<_T5^uF8MX*oG?tEtA9XmHuq~=OJGg=7XJ=gFqp|+-H6<&^1Ico55 zRe41pNJQq~uT*cADOKBAxY6Fj2{1BOPWsIU=gaqTD6BMQqgHW}=tw@LPUQB}NV?>j z^bmxF7xbj;`R#3TpW4SsFAoyVYP}EJZ+V&5hfZ3vw5D)Q{g>aDI@jA4)svXAH#k7d z9(*e%Rv$LM4D-J7j_U3qnB07EKvcCqq?}v}1y*yvc@ZY=y^k~FFTY6*V<{HF^tmR$ z^FS35{NqsB2B2qGJ5?um^pRweYwm@G9_~Zp1Z$GCJY1GJTwaBVu{64*y=X3d8T`BXGNHMN zTXRaM7nvSA(Pvr~ICK;-840mSV1+1-RM8+Q3Mm&?^jYb6xtwa)C%RVb+Uw@%A9e-jUMxa70;L@4y!$8qM{kB+k*oiv;iJ zp{IVTNItA659&j!Bvr{hW~Di$%gug?uLPjz?_UnqN|HvzxbNNi_*myF8t#>1hJr3E4h#{Ipk@ znv<)cJh8;4E*^|H_{H~~phyYo z9d#&_KrREM#7-JT`0zvfMW1$QTQd4wiDQsD>YDfPaJD*UbgzN24)b1h9neiLWvlzk z1m-G)t2>}dM59{L;IIM4BYZ)57q^$~BU;4ytlsUpNoR|Mx!~_K!4!GG%vP2IkLq1S zt2o$EPN}CtE7fr?!n{{+=bN0#prN(q!< z0?yAs`+g!ywbNY0yt`=kDq=j`cg_XY8c<&qVS=%TlNg)Yd#uSmQA1?+Jg!inq$m^-wM6`Mc^S%u8@l(CuzQy@=inmiw71OLr zadj_}92dZ6Qlm8K?ma}IjCW^*)FDk_jUIDhJs<1EFlye9&Hhd;Y<|T{gq>QRx4^gj zYB?p2NyuQUBE|)|d6ACL)> z2h5xi5FD(mK~O#bs|ixdocZQRM`>mUvo$so^B|N{0%BY*ysfqz(>eOjCNeoT@4=-% zF!xHa2+k5paJu+{asoS?|G^i33(#Bn>j=*t(ryv8yHK$u2x@W4sg=s1$gXRjrUb<) z9SoV!@Vn@dFh;SwIH#|3M80IIGRWtWW1%5rfdIXZlUQ zr8I+7x^ibhH}Hz>na1MUnx#bz$d)gU@BTCKVX_ 
zt8;-Lz+e6*+O1@e(5O*qq7`S{HY191GpH0N4Idr}tc6e$M-Y0?MVJ`Zry0et^@bw5 zC7CA>A)E|YPWE^rGo0=(ebx!>l#*9bj+n3;Dt4EwXN@_tG1qjQM2_5|$!GG58|}OCxFt zDLPMJ|9*Q(X^Pw*VXmYhEQLHFU{JXvtgU&~4k8j9zbxbA@BHhuz&p_~xcTwR(oBmY z6%T`SF}$~Z1XN4Y4z+JU>PzP7!29X(j5>vbOi}y6X||kV&$*UMc^x21hR*GK7_9zG zwRS7*iX~<+kFwb61*XosZmGynhB@;jrF0(n;9e6?^>)23O#z+dlVRi443b)i4z6aQ z$griPj09%z&7reAhRI7d$hagi;aC9q=yg1nUFNjk&NH?J%;X8<`03<8efI9Q0DZxx zDca1`fZ3HgqEt~(Lo2wI%@x(lMIEWun&h-E=&;g8&R)JXuKL8AV^kYu);{V`v6RUKK* z#7m1FwZwMQ(+-tnWKhM*ci;;;%7@98PV>NeG8IWuz;b@?a&jJBLlPMhQESEF>dCr_ zP?fM|mGT!y#H7H9H;$zoBY)kK z3LYU7Z3&5QmvBB{?IEn{${|^V8!&!`Rsgd(KvzhwCtQ4GqS4rpRkQURIrT1!auzV& zfAT3!Ll!hlZ(_YQqc; zDvCub`HIRHn4&79d{m^iPL1KW>-#O-%sBi^uZ8cev}#yWG72?=hkkp@In_{s1E2XK zNc!ui7j~FdYa(5E@X(0{@WsDZVe62kz}-jIIv@DbYMyy$#lkmWu98XV#0wy)r&L)d z|JmS>qM35wloGM?a9HijDWI12(&AS3;ywINC7(PT2y3yZ>_bo(Em@c3YvZ8Us70YM z=a4zoxZUS;RoaMrUN@YAN*3ef1KPp3{TBIjg+yKjDGWQY+*foBq)oh0PjBs8qgIgB25DfIH=dyHEd||q20{3Z_bgxv4mTX z4%F*%>Mpv&(oz{`;dF*z-;bp2T_oD{+(k4%WN(v4rWZ6y#g!~3NHFWXyOljZO$Pg2 zoWZ6Het|O3{5W}`sn=lx(4YIWxS|=alU5#6O~9g^2ZzrFRs)$jB*(}*R`Gr-07jO=Fw)QdK1+JJNBqBHZ4SPyC1W5G_sO~TBP@Nb+)U4mZ5qd-~nn7Cs%G4TA3hi#Ig`s(J37rfW$dY5m$iVe>_7 zyrRrudwCk z=6w3c3p33N^@AQaXsFi)jTtRO4s}>ioB~EApYnSCfRE?F&9jyyB_lb`{x_#C@)!$h zAi|PbxPB}_%2~6#MuG}ml~Hz=-Z6SFaG5iAcCcXn;Q2q6DjUnM59Xe8W}$+3Zfh?B zy;pCd@kUUdF?h+ibGqh1bs$-ipCxKGanqbFERgKb3a#WrnYp_gPtE%K5?&G{THr_A!oxAKw&m~)9o`;yh)y`VHHU^9sd za^*=@T;v>2=<8t^QbakWNqiZt^het;iPjDLdu+<`e&~A`;mlV_sf|Dr;z_v;Z%gB# z*+%J#y<-k;y;r>QZJ4BkCZ1Gk6fVKUG)W|F!8So+X6YYxcWBTVrOs>AB87+99Fjc7b!E;dhlz_#<;4v!rzH{UvnO4=*kso) z9-39gHWK&+uBd{jNs11ptX$VHia*rVmRI||zfYCFIb@e#_T={xX zvqokcT=;VBSG$Tq4%Wt#*nvjXdg=J};Q3h^okPx^4eLZL3#+{R`#EN)Aiiqa`&|LR;kG&zVmgNy zGXVj-+>WweZpwmXHmAJtS*$VhCB(~`wuyxrYn+=d3H}pW*p31vPb7t8a1xnRWzI=h zj$kW7B!usCCf#doj!pSZc1B%Vz7i3J@ zf;6@B{4UORD<9X80?!@n060fV@j?=(!6WA#Gu|5xKqV8>hQoP{C~25lQsML`X3%wr z5)d0dG5{kFuBOSv@-dxyaAvZs(Y=NJrQ;T-q~}rPM$3LhIS-o7EK@W@wT|<7uRilt 
zvfFKHeMriD1hv;PM`81<%Tk8=Q~mn*MPK>1&tD&(AHU%5kB^V@7koJX`uzIGyzY@0 zfqiB18Ow$;tSrhz5mCx56@mV&R6Nq*iYNGSn4*gchkc2_iC-y*Rp6|5SIXyha6s~S z5(Or}eEe{jsM3n#4AuMj{PuP}pFh6NU+2#=eEa&ke0_eNzb>cC*ZFe3T&CR2?VU0i zsNYfO1k7 zQoSkyy=Pjq6jkuU+`MXDRhVa5GdAT#UKDu;$bmlrZEN@j#1(ORtWHxO(L1${hADBZ z4pVq63(m=hJdV@iuzz`B=Qz((8V2coO7Wm;MSK(rPAn2}4;RagQlTUkz747L(hr0) zG1B)m`a06Xu1%KaQ4;4q4M#f7F(SpshO>%nugHa%u*MyzfvElVdq{E6H|q$ZKo2*o zj_LAhqC&+r-E^raFh>7LbtHbj9m?k+8w{1euU2vpJh5(?WF)Sf4KQE(FYTY9Y&AF( zfPwtznHu2?F8fp4adQ3b(%_2Sl)97HLxfb#aHlHQFQ=|l$196J)YTN?&kMx9n5dJ> zox$3rQ@CD5vM1d*W0FX-(Xp5vr4;iRg<>Gv@R&aRkXpnKDcn&&3`nt&0DpVp-*A6! z4g!W-kT)l7B*=)a6E3tE@ zb$WLDcxQ|BLyPqDQ=RGSpYQMZh_kN~`#f=u#_#w)c|i)n@HI5CK7C`V z7^M}yuaos%o#7RFNu(O%41ta6HILlYKC4)J3(E>`tc#hjBja;}(8jJS z-#*%?s{v;euM_J6f)gNZe?aRhE2_O>MBqKfpAmCCB^L0xxi+Wlx4)-kuL~5hs*Quh zd`VbSA{K%SoQ>vGeAcU%jW<E4V~0&|Cff~vaTb}OfaClw7bz-@a!Q&#v?x+q^Njrn zsSsKFXA9CG8x?|0+o{3zmX`^QzDHVc&ka6LDpFpF-lwMn`bF;CcnvCYyIMZDZQ5p)Q2ZUekDPJg~PCnKhEMVlpx6}8um ziy$MIEjQOrm@gJJ&8CygwgG-rSHSKn9QR@EK%DW)n>^~X2=m_Wex_N~iqloCEFP4DPn_rvHJm2JpeFitoepL}b8Qp<6coSTrG z?wX$IQ%TsmqEP*5xJY4i;ccFoD*!AiyoY0nq?V-zo>A1O8a1nNm?jfm`5xzU1?Rh? z@oj2g5c4XZ>k_nfQHYv&8w@7smlxFM*}e@6y22fn=JSVc~|!tvw_vt)!cv*4+ajSwJ0A;C|?d};;go@naLe*5X?lj zE0r6)!O(LSYsU#Du=t&8hpQ|3Gqv-H)JQv#KkQRB-~hhw<`S~4mNPhbD3`5*=RkSW zKoC`?5-Q;YkD1RMWO%71K0~YO!}YpbH}OS=vcK>nnE5>l+i>omk+qv?lF~06Q_6s1 z>H)UhzXr|j&@^*Trl~2`OzXtx-UoV^HbAY`UZkV@KO>PnY`>6v``fOuEo)l8 zsYax$tCLbzB_+~u1k!DLIbMXdR6~zj;WLIZoj~J!I4NXY89tOIaa(SBl+ zzqe-{(MdU}kI&PjT`&|0!e?|y2TRJBaL}!yDUafEVzWqE5h7g3k-Cjtf)Yh&ArkR? 
z{?Ku$t=r2~#`&l+=Z)O~)Y`3ljM#pGBOiu!h_;2%?h(~3UyFAJ2dM=3=4DVISuE~*c-l_8Vu>IVY z5*v;n71|`L*dndVZ6z@dcnUg6MjQY{B>aLr8j|FKco zv2rFu%6(|728y9|pn%l*3rau#>t7FQWb4LFQ+>|A+1k~Y@Ls=%<{$qs zAGMhm_En=Nf6Bj15h;2o$b!YFTL`D?-={j4M=+uHc-~ib{!yLZPVxKvuI){`lM~<6 z43YgQ>ETi?>S^h5U&-7EDhdHf^i6q#sN;MSp>`xZUp%6ZSHIM=oIdnuhZrI{T$Cp`p{f1 zu&@`LnM^CA=NsEsEo#&eA(E~6F*W?LS8WoifHFI!sdGNZE@`kTp&uxAXG~s|-`Ilk zY$;KVz>Bg4sLkdRTs?mUcptCHvfT|(FG^*@RboCqYJpwwSt^5seIGPEd%dsNOuW(~ zN@W;0!EJ{S4g9*6#fy}*ZP9~gA;H5`D)y(bc zp{<`M++17kRNJ>-Q8TimGUuL48~@X_S=(RND{|kRX!FD5knU}F?Pb5v(SsOI1U@D| zhDwZ0atG91wFkG2!UNV6OQN-;-1D%eyl8K*h6m1c8E2c)4buev3Q%=SfO;OO6CLSc zM*3Vq3ehLTb{^9X4gBq6UmN%EX@)Rdm)|BNWJbn)i0|RrOO}4*xEr^%+;71USw9`cBJcZDyhV760eOA zyHIEut_w)tk`qFS_jfFZ1o?i(NGUZJ)%d36U!NCMTG2{=70Disr{>2~_bl=2C~52m z&R_(Or1=;GVSXd0)AkW6G)*X+sI+F=&b<-Lonmsli1DXY@{W>SbpZOAfBgLXR6B3` ztf2F-mfRO{KIg}_z)Z0j3;O;M{ht@Pw|Su0<&1J~TYy5)g&N-Q7Y;F`KO*gV~ePuQPPJG%JoW%?mTiWQvHv2z;f zFKnJ3SgGAp)PyQFDjF0o3e1hNtSW}bmk22yC76$DN|Hb%LDXgko71V)d{Sqm@yeD|3?i>4w89&-qR*8T z#k+*TdrO#QeIGEB_kUiS5N%ykZPy;qLBgoNOf9nmy`U&(l=(+WTmPuOMBA_7X6Vw6 zO*Xvp4OivirORY2x(YX~cjy z-}Sh$KX}=g1!s z?ekeiTHWA($0bL>y}PB%9}B-+S_<#wwplhswPi5qp~sZBWN-p%AsbW){HYcOnSP&dRFXC&fMqHxjc7gW78`rj&v3GOBhTvH2Z7hS=@=73v@QiAOU8b z(yNBnx}BFt#Js0anJXeUL9Ot?v=F&vWkQ+}48Pfp+gBEe)|y7wxxYMuN|6$Awk<=w zcTmrCM0Fk((*2wW{-WmRdc6#0mk1`Y&k3TBTn`znJttNOMrWnh4V#Yzg)CUpTd2>* z1VH@Jb!Hy0_EYs~wxD(z(_zm`5hu3cU(FS^y%(L={OZv#fm}ovs$;4u$a9FnH zRi%&tWh|H|0NA>s9nQxrP70!Nvc)=aauL~ePc><#f}ek4N!(N0p1IDgQZI#}V{u=C1+*3vB*TFMY z_)sBZoIqOcS)@4)q2xBC7FSd^eo-%AKUwRH6xgu&oVEgm!N{tXf!-_IptO4vE}PeM zb0?w2ZVe9ayXkhH*hf25miR`QeKG%qhxsLC*(6}b%`yM#Uya%18L-(1?z1lAi|m(rtp-QIGz?Qs|u0&OA^f@>SXblx~k- zeVq1oJK{V4Zx2;tt)7Shw8VVARts_7)HN2hN_&=mT5Zwu8x>l*oo|7*{=T6m_qr1Nia>1)MbeLqObtmB=3_2Zcm=-z+YWc|TZq{d5SP){Pr zC7@p9lK2szv6;}s>{d_j5|PMZ;G?fKL&=j8zMT;+ZhoJL47W+Mp1GxvwbxGbH-EXo zAaj>EYAn(oojulDG6ov5FwcflN*O-;%@>o*E379t7rybfQM(KGvi_U4Js_vJ5N2sT z!Slm{(*n!~hq++v<>0Hh^-)&0$f+gpErorJNSMYJlqF9@jpN}8=LCz_{-xvDw?`mT 
zZHU_~`rp>j9!x2xg28NR?s8wXOT_V8#l~EkkCf7%i$>-oHG5M!0;+9ph1dd=)VK6d z0C%s?=&`|Y9AFRB_9)d*8}ou2rqH!Nm0(Ut${IGq+C}^60K4a_QYG7Z?&X_0d(UN{ z_bOoPc=NW_{&tW%btIrZY`tc$D4kIfP%i>S4!dV1&A0!KjWev9ygRhsr{8h-vF+8E zXle|j3_AjDV)g5~+5nadvQWoCGKS8mc{jQB-Arci)FJtFb_Dz+7eNe5v}K;X8NMzn zfgJ~MXr<#YkwJHOPI01uD1nw>J~B+){M1~1Xu=hYX=q{)XRX~eeAW5{|2ywr)0x5Y zq;T&^q3$bczB#RBM4=AsI#z$^HE_#d#gyC#xvLH&|}|v(4OW*9j5lxz@F=s8`*kA6f3I$ z?`;>y1k}h`GHL23;s<@tTLP@h%wZ>k{%HMppl zEIX5DqVVHQWAaRWuG@Vx-)Kh4U5?cHP@#|Q|0uJE6yUk)IJ>6z4(PKF?o2Z-;tJI! zBAc)E5zL7>6$@(XgOTEGQO4wYUSaR4JQ{B?jl8?`2WV6$B6w`?c{dCxgJM%u2_zCJ zV)vt;n{!zpBi}mRCWx0-N${Kz1m1f|6U{}KC~C>5E1K~-Mxszo``chqu^v+- z7Skjnw}dl8h+2y-Nxq7aolicJk-r6snXV}v>ed~JUJl#dav9_+p9h4IIJtxpuDsX8 zU_mi)Bt76v_5KHh)+J)qu%w1h5{$6ImG^CFbOYAKlv8E|aoBx$K%(#G7q%KTNQshSuY> zV-L#ahp7N^A)}L88X)kb{Y*R}x6X|*)%yNj;y5i)XrgNU2As5TySfTDmvYcNd6e+s zMNzA!0UowLF5tZPII~OYUPdodo*H-52EE!)I!6K3lIcl&={4@YTSl5)z4D5_IL{=l zZW|x*kOrTu_34ODisQF9A+C^2QS+>$9lcHp(~iXZR{%Qeltv~w1r{|P=`G_DQ-f); z+=$Xh7Iu`EFk)F0Ofq(9j;PB1#1SR6a=c2)G1Fv6z9^Ec)NT{b*JbP;{dYK?F2(o# z`%cLcMGOWCR+idFhJE)VuO#dA6>*LW4Ly+>J@*N{_U6ScOI-UwSz4i%?ekW_68REcc!zsxC7 z0GiBq|1jZ5(HUR*RmBmz;Etg2^u(C#9qZ8IxqjRMduK);g?DcV0K;?W08INgFtGR^Z&PVgW^q41%o$|#4{5|{aX zlXFQmr}~1;qU(Him;{Dns8q6+}nLePHeR!5|KC1V( zCTfi0z#xeW1Rl=XR(Rx$p2v!U&()^ z)wJjw-`kx0ou5PMvaq?T3X4fdQ(7Gu+3k_E@4ThwQ#yP?S()C~Q_FnShVMls27vuD3K}^*YsXcw~i1o zr32#Y7$>V5wNSD$!hmy>%dsMNlN4d*gk#IA6WJiM4M+_Euj;#3y9vqya%T!`%9WF7 z-jA%P%MIBjjE~ZkV4CZ8tv8Qd!rK{;C*QC@c?=ZQZsCmnKG2A9{{DEWof0HB&&E(M zt|;lmyLF(CMWC@u>WI_3h;dy>(k!WnBITPPh0c*I?{rCRE{AMb(%u|>phAGf_}(HU8({$b)Y1}NP`y1ZqWKb3Fu;m)6%8#<)7Bo%yGrkoc?6nh5W$$O3z zM}vK+ia4>WplK7M>5tB}kr)or;=}f%*70zgM#&uH{u6E!P`5;{0*mak+{l3*GO96< z$}y6W*4)kWVUl*j!g3kt&;PG{&Q#^PKc#%!8dWB#Y*T~@kEn`Y%~$K+i}_Zv{Zh9b zI~}Tt;;ay#X#%9y3>K=UCEbFwYaYVvof=Z)pZT)}bM?79WSY?w8 zs;H^9h)%O#pn2UrcAEzrkOv677@ye-h9y)>Vj#xac;V`xq$rL&l)+gUfN7EggghY#o5_{|EVM1H{ZPN+6Ou3n<2ak*JEA|0H& zA89lD|3}-GELV;zORo7DW!*ZN($PT76IE?8Eo8muyVCys|6epB0FnVt04ZJFs?4l< 
z{a*V8F~s2Ez6BI~nv;ci1JFnRD1G8;1dDRdda0RxT8KQ4HFJ@Gsv;+KYI#{%u;{IB zf`f?vvi{#Bn=fI`iN{FA5LR}HG+uqVd0$1kb&ryA^t%b>e6qK65T8Fiv2Ox5@o~PT zdZ;@1(vR_P$aY~&3d<4kNrh7drm`e0b$0dX!Omk3=f`yTH{N`7 zY?a{=uiAC1QEbgLPbt)7i3IICIQT>p1PsGzaB7i_U%jB#Hba`A&XUyn{IuZG<$V0? z1xzq)eMxG>b(l&+*#rgaWcVe$g$)nk87T2(20!6sAYFrrMH8E9q!FqDtFgr>o`8Sx z6UjQT7vq9{`l7=O4);78HCB*rE9$v?nPMPto>c@1sfzov7x4dlx{omdh64@^cRTl-rHC2m!$};T2KNP8v-fi1X;=wq|U|hI>_an`7#YLA~PS1eHINI zaoW0P%0>#9;5>tUCInJ}dB5ME;@|)4jz7=${PX;Lf4;?E;rDIxo{={0+FWkuV`LVA z9*kgAmovWb+J+d5CDmygeoeSUBT%T0$Hf(r?$6?y$I(zAl6_Qw;x#XGfAI~furg$apID}#3jgrFnXL=zbNkj^QpF~pC8}<4O_V_kLzg%Fq%DqRADv} zt>BCgQ}NkDT{2sm3-+8dPI8Na1tu2Bt0-gK^cM4@*FnBcdVk*kSOz4u?N#Bn;S1F|w#-$t>OsRyH9GILHP=0@F|CYK6e>MuN!k`3CMn zKAEg2y-?Ad)%0DGPdq^R{n5R=-POzM?dH|pJN~YV-osWRqWPv}{H9H#fH`|m3834w zsPEV$T2xuh?~j)H&K0?Z)EAA(g0PvEJ6)x`Ut~?G$;PjMQYZ0bc5&bSqI*ich5)6~QHYGNu0Y5Ls&w3YUPVP?Q zbkwOWcgLTv*@i1zPz#ogTI~UaAiozsU(h6}d}={+{0yCDn7PCf2|4jas}W@M9qZ~S z_h4ep;T>Q9rM!FF90u5-IJ9cSq01SkAiN@mgX_R2tDyW^^l!)X>CdE}3nxpPiweI2 zA5h5fd%4C+ijLqvP{E3{$N8PNX@^>nrvk2X5PyZeEh+9=Rl8$jl{{v65g3^GP^86E z4?++r(yy%P2l~ooQTLG5hy#{%hJiuW|I+UeH){NnxJeaEU77->F2X#6X?-2#{C?Vv zn@JlUE|f}={zuI3U}hlR_LlDg?MKyDDd7i zI^YT3ub__iQ=evq>QHk_9GSU#ikq~F{}lf6xX^!{xghWfQ3-f-iX{0p!{h}_jnC3H z$ZxWqAEk9u2>GDj!VPP=;Vs<4DOdX~>!#@v6|_tOrmUDO)js;iR|l-&h-DZA(*x-; zn^{5YnwbYUwSj-EL$Ql_Ipyrp7njo|U;8@F;_bX8kzcs^dG_=XrzO49O^-wkZ(I`A zm|^N=F$W5yhaf>;QU-I z<_6ByMQZlSczbNu4P)#1;iO7B;K-Z|yFR}TzW4`NV0PaB2`~0BC&+9|DgBV6rFYJH zlF|WwyCknBg~`BT5V*s~8#BQy!#M{YE*LAB0Vgn^Cw|R=J4W>eULs&oqF^I(2p&kAMvBapW)u~xFKI}5!mbCL8Zvgeq-BH;0MZn*kb?*|M>!RSnurPxI;6XNi)O)M+`A^Nuls z<#{)rZ}Xep2-!>$1fN(uPL>}0LZpPT#LlY&T|S}nLZUpr*Mqm$ye5D?WaA!Xo>RwY zr;1Gq!}U;hs=?gXpG^7+=}*7mZEXqA$=ceWpa!x@z7wdAeV`zd9G-%Ey2BoLejc6& znN8b5{M~^%UjcNqwwC#V5^MRioF{{)pz}@wMYG>sRB4hVUHhWiP1RYrpe(y40Rd*b zpppxd1I6br`ineA&Q+j0p`Iu9jc=z3`5V51b_*wX4B~KT8%|UrG&sWcswc}nGZ#!R z88>J&8L=J=Z^&|*;=_w1J6^%~HO+=%`uR53ZFB9>gzy;7Ti~tzCFUDnS!#JIQfgPn 
z`O*2H)C~n*SQerTKka}gf!rAE;d2kEog=BTD#GWud2zO-0qMr}^dzc^TftxH+boQ_ z$db~a)Unb2u!i%S6vf5wid8`S8&Gl&3PkfR)%_wcK}L2J=xRyJFR#KCHA38_#^c05 z?=@AMcDi^(!Az6`5w`|&#A)u?UP3dK3s4Hl`2jxlrKwctGQ2m7e+<&O>!a6!7*d#;|E4y+x2Qbc%;c z&^D-6#Wk8da4Ik{xfg>Soj7089VINtyV`z;|9N~doF7kz@`I6v9z3Ny#2kA_V=9Ui z*Xn|<_h^_J4kqh=op0PEau!ZBfVu_0|H|BNMJ^*RIfkV5s)Qx;oIWZu_bq|KTOZ&*7S!j)q>CjW3hYq(Ewl zYg8+|@ar&-rBYwvujk8pPrL`wJQx+A$KDhdsY?Dbb)e&gZ^$|VdnjH|w*g#6v|s9D z8`d_>0@I8#=_Zn5i7BopZPF83g;^=(#&mz3hnmaQBJ;IRXg|8d#XDmu z*jO?FHW6f`aRV4qD>X$aU&_>Btm6Fc_5vtu#$L1 z;czt(>euF)hM7ium(UM6%OKsdy(`wW%%8AgwgCINv91o&`xkgY8^wxLRS;zDfkc3E zdZJ8H)H+(!&%CIN*#~Vv%l%nG5;@3!ecWd8REBVinbg0pVcP|1zplG>=-a`GgSU)# zEg!!SMm8-&ce3r|!{kkTQX%R$L~Oy?vbY2Kp72N=0EMBFokwb#Y7OY*bm%6yrBRlT zl!Cs!)z)iH4phM?gY*fhnY9(&nH%uq+4^f>;KyBPJtg@k+oQH?F~*v0o9KHAjItWk zPnfx!q?|P>7if@y^0+DZo4?0Rd?@@TcL>A^Nx5*TYE9UgRxNS%t~-K>KFAXjVq!ti zc?WTG_YnSuL!4@lmkV0%F?SZ;5iyu0PL@>4s$c%ABgXdQZ#EH%+_MPJ`mbYGro)kp zF9cYkXf(p1NR6H&lstyDh4gcQ6s53%dY}0Bs7iZ>UtdKCMwlgIW6^kG{l-nY<JfH8U4?`G!)ceUEmx@>qqz6bc6PJ8bJ;pP*CSjH&763u zX!+T#Qm8%&)E$BR-1Tm8`ByXWRT$kiy^e9T2IoWLr6dQcM5=PltO@B>Rqs9w%%Cxn zbLxU8iV-NA>X2jtxd&O(GKuQDT!-AdLeT;{&o!F)uk`hwLwLUb8SZa)=N|l@?E@tb zhTGOIvh$noR&PF(c55q|zrH&&F(ge}^8rLY6PZ->B-(dn7G)EQxCV3Dre@gY!FjkB zuDOZAxLEx74Ct?PmzLLlkK-+MoS$pyg5mLfkVF6eioM4m+3m2(qxPh0z_tVIxWR{b z4=2oK8d=A=m6qzPO`88HJn+L9O706L$1fiL25tlQXMhZ@N0pk4}d0 zHv2ZTJWhR^VHZnrzAT!!4#f24LoajDjH3J;IeIfns6`loF^PEv#oYVTYk0a1(n&Mi zq>;BL_w;vR*fpk3`63k?9p^2xV{(%^n&pYajHoVXjPz?l{ug)=lL`!wVkFeiC^g%; z#p{&RfL7Vk3h9DBJUnMvQY`jA#f6oxhs;+dn=C}=T#_Nabv=7pbX*@w%#3jIm1Jn2_(6~q0z%1-i7z zctgG4W5S7UoOFSV6sQrbSW!!hW)7rO`>u-e`}?!o?%)4RZ6h_y3@XDwIf|%1MySP8 z3hG~}t9KIVFEvY$Y!r__94Fe;LvKF%=~Y_Njp)9FNk4tAK%w4BzNAw9{8+=HZ}JqJ z4?aZ-27@ojx~aCH-yH;`d}NytIj#MKRJ0x^4#Rk$Tg3%kQ@%_~; zO&}l)mzW312@Tq1Hb$vt7b+C@P~X1^YI0pquTm&}d@wONb7yCsw%kwY4p_dXNlLi_ z6pp_iz{AP-LI0*M3>3|P3LBp^3;IQJ7c98CVQPYpo3_kj(Q(SFHv__1eVJot@yMxv6m39;*Rn24=Peinp}1U7HOi!=cQ0 z2?1S#e)}Ruu@2{PTadmd0jmI$FUoePPZeUoWfarB>T1HwxQ>sq4|_j;RpQl+##2z* 
zcW{js8_En}0c=<5 z|B(%9LSmSQ3nOJFv0@!GpB1BfR2OlSO2dR#mo%5AZ)ssK%#|!nW$7*@NWr@`F`^rg zF5JV3iMV~wb+z*g$}otI?^8@qoua3{5Q_j-6eRi;jHqcxTnWlQg}K3V-Uge0r5H+yM!#clq!`~Pn?)cO zApf%fNqePJ=kXty*$a0NRkp*OKilPP%FDKH-{#b8Dc0qZny$K_@y)j$_)x(cG^Pmt zKG|jcAkJTyO%j+yIUh_CUlNV0hVpy-^n=PzlCQgGXnDm_nqZ}EU7XHNvqe>?!!(rH zYSwHnm^IV8lny^?oi!_~KtDPkmlH(pCt*1J#il>7YxVt=IpVycROtYm5|bNl4JBwY zSPn*wFr{OrWHcPDXloxzBX?WE+=ifMzYt{r!civ09zsnYKg?oal8li{Y@vqx%FmB& z)_9kMcni}78mRlloiB@6Una^?eMP^?fPrt^q$kAIh)3QIRExejsu}9o7!Q+?TOqQ1 zVel27C&3&_s#`|GC`lW8Tqcp~0_OJ`X3^6RR6k(qMFw|PFKD*-0dGvKMmkY+%o5h@ z@sl1Y+0~Zx1DU+388u$_Ara;BQY(zC6zD;}N$bxRQAVzm1ceUtB~fe8Q%n_O>b?o| zVFfK*SJq#8=a5ZHuU3a{1vUxZ94~D*Jb+*j$XiU(>TTz%PV55B6Tv`f*X|+J**f7+lQxK(xIM)iR>CFwGQ8LYXKu{?bUy!P!I|6iAEr=gLT&rpmZ~jk zOsaq^2G42C!tY+a{zj)~%f)>@{{>Prrv42`bv#G$HW{97aIl`ApTHq^uC920q1vBk zPNYGuthGf32uSrD2tZd-ERlXrNIz$!c#qyE{{CM*gtxbNlI&j)iVnKBON|~?dev-9 z8;*=_1HIE8+*A1yD~VVw>Vmhd>c;n0iyqqf43H)Ha%EJ^!HdXRX?js{i;@i$jE4$M z18D4n63T6G_H-pZ=^P4(R!!Ukb_;7>Qs$EEx3XGSzT4EZKqlklTtE1 z0gCn{OQg)ArA`ady+yTnto${K-^z%X9DbTrC*dR0)p34yeh~h_ap$&o>FH}~%aM*= zr{w7e=QkWQuET_!+M{fw|3&wgm818(p1R1Yc;ZXeKtl8E_WJq+2FYxyOR86u1p2@; z`Q5AcYhtn3-&H*R%_zg85WnFlDI=PI?kUOSoGSc;j6&^n$j7H5DB!O10){#@pfJ5@ zjMgkAP9FZs%{1$C`z-!?wnznfoDwVkF-y@&L9m2sZSm|wdtw{vCxwa=@L{T(2b2|2 z!<(svs0aM>EoX1CRrT`dqt_?ZQindQ$NQfPph> zM6pe2CWUH;A`S6@#6*mrcb4=s9=}&-ZcaLPdtpz`J~0=6tbku%-OZULS9)aikGc(F zx1>_KCa=JznhF>OU@CpIs^1B0BKZ2=d&z{4r{DV?-)k?hWF144G~v-Rx2Jb|pf-So z3KSX-MGIO9BHAb3iJu@fbXk{ay5mT z?*Z2c)o&@X{-jT}(xA&H~tgn94Z-*4}C@&59@-K88e6|)mmbI|OvPy(9U=W(RVf{2e=hw=NeTX7cYoI7V4US=63V{{3IKe8)yndwQW@*l z!K!Ag=4sCOEku6?Fe{~pzefB$^oDZV)=1)`l+HK63u^SNk6h?M#`zz zDV0G#2F(q>#h~Rrygh6q&9}ecdqa~3$+JpWR2H4nz(T+c%0Eb=tnAe9BhKGoWC+4N z==*jOP!Yx{oi)j!Qp+n9%wpC!zl(1l7gZ^g69)Ms!1s3$?pErgxYMEE$VrA^)#?Q& z*%Z`7&~FAuz{=UjK~&*J{&B$nxXh-5Utiwu?=R+TUX=$D;<19!?wGNlc_DeI)S{n; z^6U**O9`~!neoERT4C=eChjp_DUVgLtSTvtHbNbUtX%dld>`UTc#ZkB8VTBrpwyiM^CG-gk@=BydAEh}r!4_X1U6!?2WY 
zQoB0((EqOijZuGaMhY`olx{>-r>Ut)46$e4@_73f3rh1SgR?!VtE2qx`q2OBM=`9? zdQ0R`)n1#L$O^b|qk4&Y9z7lHwEeP>9#_1MbX`(`=9!lvKKUVd8^A^Abk8ppD!7JT ze~kGH>W3}!D28uo9p|1H9@c-ngATBi7iW7k7JWgs(W*7l{OY3`o@+t6+3@waU!h_F zyTAu|#p(idg@H`tEJ&kKmJ$_1bUm6XyRHT5bC$loO?BdIXj57Wu;E8Qz|eC&gy9jt%~$({u*C&c-~{dSc$iM(LAHRNRt$5KJ%gp>biOA>B}F63(gG4 zoL5+mv@))%E*dCy89=93v<}q(8mN*C6WRC2wr1Fji0?gUcmapDsPTYgqnV#ka1sE4 z!G!XwK>0DE{D8jH=t_ONyC6X87=9VnLxC&JD2Fr; zP!tCd?1JY(p*j>kZ-ff{Jt&!^u1Om23Z;zKueg;{tVK!6q+?Zw`q3FCjQ3$vrGNWa zUGgiZP>Y&`&E`>6q#xbwCb+(Hfhx0ik98DCN#n!4Qz^ZOgr|FS}N5m1L8 zOnYou@6ruGp8#R=v_}I}8SZa)6B9pszrn;0N98#?@V%5zxwb0+Nb}%x=6N|m;fi<}@-> zV!W!UHX;?9bX=u`|9b=IM*;M+y9H9Okb=xi0~ zXZNc7v(|cej7JHEzO*jrzUG3@Vcr5dS+&G{z#SQ!E}+J1Rj0N@1Jon!si(AaubA*c zwBpby8cgh`y~?uUq$a0Ibybw`tlH(>A8O{2E#2M}0~+QnsJBa+{IS01OS?o!Z@_wV zS3Uj6lQHg8kWB!(r!Jw2F8;Tj(iM}n2|lpns*)Qk$q%RO|Yj{HSbhNSCoZ2fEO@h94)4buSt3-1;{j@ zif0U(rXc8yL#grLYo_&0Ki)NaeAH6k_3iQYaC?#R@sa&uQvO+tqwtK>JQ-MnC zi;ktz|9K?}?4N-q0g_U-pys=uk5%+4RG6E)&-1dm^kWNw+_P2mbWK6>7n5oQJTH2q z7M(twifL_2dV{)3Jom3b4SyocSEGmV0yy88%KbA59TXS;pp~Rsvy@cfQJ@p8KpYhqwB;35L{iqGSjy3gej?1xQy=>D$zN{Z)G5!~35{A_V8INbpC}_K z8lnoBg6aClkHIv!)(jSYgGDYr?uMEmS=36h6eQsw7L}V_7^c3wNjiZNZ6-=Rh z^btZRs(j-Nsy{M5yKw#EJ7r)5S%G4fF!V}|4=S6(DcPk5wF0wP&o?=yq<$(-gtt}A zhc`M4X|TOxwl~n|SH(7^kH75m)MPVOx6ed7Nkh!d$rw*TSt6!x6SE~!jqE<%gcRS+;}dn79t0_Y)3^&sR$)|9Gx#($7F+;s>OhCjzjnjCtEo0Ze}>h1N$hlgPq zs)~kzREGv*Hr`k9akX?wsRX4{W6}3`IYB$S6L+&CH2#B<@5)$^T&#jNs8NB+$gU1F zFR7v8=)W@Bm9&z0590VV_!qP~l@us#S4H_Xzf}iwXcy=E!MO=6Z$U6s5IS;!HNC6x zv1-BE4=H)FItdTlkoFTEMKD1t`6A82O;|NjX>43k4JCBc94D(vqrXuwvko)`D(3K- zG8aw8B#fX6#TT&(MH-L3!D*>hq~+!0rltxIYho62rQL%jL*R>IsE9igR3}TIoHOg{ zD0c>sLj3Ab0?I;f;)1e=$({(@;IJe;`O6kM^84#hJ7w)WEioy(`8TMl_<%*g^M)7> zscA8ylEy*)O>ccye0t!`qNLBICG}p9N_6p{=_^{!s6)UVQhuQGWH*sIJ(?h1MEa9U zD(8GLN(Ciny^tSevUo>9cxvuQ?}Ge-I_g{8i1u;rN`_bKVm4)l*8moCLg?*xv?e+Y zU0#^#wO(2p@GX$WC@SuaCZsE&4EXRb5<`}VUfT28vq+vnWqcLaR*+$t0;khz?DOPy z^a0NB8f1L#1zt-aGBu21xmcLo(DYpBq}Z+6l!Gd2yzQlh(gej>1STdspH`RiqYF>4 
zZZUG=#<*?up=&NUFmx_c8O>0N52Grc&ge`>t0MjG%wrld)!R>$xWGk7v%DORQ%v}w z*zbxcN>b{avT5?X4mBoypoTKUbDE5Ue`f{(mKB(Y$);BrK{o)6qb!;=+O7`tqnpAe zx0^|yx4y+1I^3epI|(w0G?SKQ8ANlK<*gz80P-v(p<${?z6^!KISUidGpA2<+qhp6;rlD(QV*FdxR}Kq!SHF;(lLL8;v8 z&2nA+R{iRRFmmS`4HFthQ}Uq#qfzj2E6Dx}q`B(bf;2edj%8-+p;_DG{hFd1(34EO zr$CO->>)KG`E8070-idRiJSBPdoLV zT4}OkG-(=Tat(IWBrI<+LbJf?(&=x(3W{DH)6~~uukI^*vc|$MIE=%Nb+nYOmys1} zJ5*tUod4j@-BP@A^?lf}f;#;br<-;h6N=sTx1tLgHRqHT6n#yV6Y&{JY zG|f)NoLUx%s)i=|v_)-7R1(ndZJ^%>v>UV>l)J^pclaCs_-GzqwT+KGsAL(*2DY3< z7Nt5Omzz122DX<4#xzH7dAdbup~CPKNw+Pj8%o!in&F6OmPr=iE(>0hwmeR{GrD+7 zlaPOg`nd!3t9#L0>JXg4klLA%m=^!pg~wYE=9rbr`OdQ+@1~fV2<26L)k%C>2<3__ z>04dXucf2UXhTHm6o2iN5kE{+rA)5P1v5n#4Ii1le+yDvz~dYBY|J-~_X{1t_CWMs ztZGqI!SgS=I@FE$G!%dG?iyB*$B!>6$PdS~xd^2xRELw3y=#K%PKVO6c>@4R1zJC% zC{-0ozCvl#o~$F?$WUl9Z5f0tS9l4S_NjPT#WaN%%;ZAKweN9NbJ}+V^z))t)}? z59$<418)kI4kh)dL1%G<<{0SLNB8mqxk^BBtestOMT1VIHnnyr^s^7OcR6NhfC>w0 z4{4tlfSZH52q4@tPu8dI?H)i7W4~7QvFa%R!*~i@WF!68s_{U2`rlb6ly$Gd@Qcl zvHG#?ACjxXR7=&@U##rsq;cZI>!M6qX#4d^LnqPG4sn7~{;LXn@HT#K!;?~$K{wjB zAMw)#M=xinr=Y_lUDgsT)7f^s0qpQL1ZLsc3c;FToSzF;T(15m;+OD1sDnnUQhaw^ zSuOyyK!w>PKalVJTa;RjwyqGknqt(uP)!tZ@1uxH73gPoWaKEZ@BVhJy9(^I7+OPw zT^uW_L>L#bEY za}h{dwDgdMGHYDF{QRJyQH8nAu6(r;pj#g+245bv6N?bgl}~`qmL3gQAKlRN#{wJb zeBS1PfK&16Jnap&N}&GJ@~DtVauiUozuZ&7l0}SGI8ba*-J5>?q6i4% zG$Kjnwjf<6*e=^9!*KHLTFFA8v9s20;g+y~YS?s9)u7U=2?VoV@elreaPRQ>W30`u zIi-VLmO*tgQIabPWwKuM`Tnm$gn|RcxNg@hZwfiYFJ1+UkOhs(8KOXa)Fwwh@{8OP2Ix<7PdG z{EkVR?t&^*kk!6B|8gHZdqHz2FrvRBlQH-GRs!zpF6T`82IU48Dl{XqwT@%1;k6}!WLBwAx78V8q%*$KRiGsP{+;sl-)l3 zVc&hhyp?-WQ1N(yCp8DhXL862q?pBkQ2kdO=(Z(w%DJ`S#w10b<2iZeyuSlRQ`$ZA;BjZwR~I_N;b(p{l2f_eos z2p~B$1<>;0a#(Gt<6Iy9VPe_YAh(Kb&DI|r$PC~l%Y=~hOchY$H4XKb7N{nz-fKKX zIyr!N#5@m!g+VgOsN^z>Af&y`Nrgf*ZLxW@09JhXi4+zoHEQ3usB9`AuP zX_bL;QS4KZZ%6xW0!OxL!z{p zRo2pI_9)Q?Kye8&pyZY+kfzFjkWElQnd&)}SM$ja;TMMNM;A?nV`4JK0?gWC7wBRa zs3c!kgZtG$xb~-K?NezO{BvDgT&?NCn&MPt(aJhZoDFe>Xw`4{sIuhvra)#x(Oi(z 
zS!+D~5Wm)8CT=KvbHBy92$#?Q+L)5uRlK;XfO-Y3f(Epm;&1jPd6WH`KmCT|Bf%;P z=8aJ~4X7>a*Vov@&|w_46dL_2^5|tai*;^d#5dJ~WVxkwKmn=Gp8u?NX<@HO#LCOgvNf ze4(1*8UJ*DGnvTj6077f{1a_ZccudG=&0vyxFW1Qc3TAKEV>w}&{->93T?P#09ApB zWagtV#igM2LP(uN!C>AVy2G@-RsfaUA}in&(2%3)_y_+ka2!kzXjXeRrb=rH4)54$ zwNwF~i{j?jE~Qg-px>R2x1LSPHOd1yZ2eB1F0kP%g(XW=RcFN`RY-00IBA(Y>MBUT zI`GhvSd}o82taa{_baCKBG=dY7^~2kFfMyyP>7W()T*xn#p!U3uf81!lkIh?0@y1O zt5{x_s9b3Jn1dBZO43$$+He3qHUM-MXmAthg zw$JYh()+`+(Kc}`_?fA|+-@@macWr}*Uys|vurI0*Euw+V8uRFt9{x8wkurT zlX78J1V9Of;rUUYfBhX4{$hyYm_@}(^6iVXAI0DpP{M0#`rW89*{_BZ%l*tXp1%6G4BzRpmVrth3vK~>5BDCL=wVQ1 z^eI`m7+h&ouDrAQ-ZE zS{54nKdJ$&j26hs(zg}HabD19p^mn57Ci4=s#he+MM#69A^$hHw6ZI6HM+F4R1IEh zSOKgrY)r3kH~`1)eruLx!(*2$t05M(`&D(QpOCWr@q;mih}loWE3{XOm#aiGsNyXsUf47zo~A^Pal)cL$xwi_ z6tKPQSqI8{z_GD-wrGH*i}V_#Vf%Dr9Yg)<=-0jv!;45WVjO5pDS^Fjzgr7=nDs1M z%tyV!0c@fR4AX)j6_CSLU=E{c=@!iI8Rpg?OL8ld7{l~Ku>F1c5RCfeSt&rs&5?E> zuW2OQ1J#Ob09~f6G8KYqOPWU7-PTaEBNJ}d6zv$g)-nf_1yQal`TR0*K&l_V^}U7rGlMz>Yhp4)n7#Lb!+gI;D)-9t*86r1!RT zjFO}!EG9$(3Ial2g0-^DS-ENgJKW5$E5PdY9RRf88rJXbevd)BjZ3L*id5F)t*7^EDzSjW zHcSi&R_b90j)qveu6}>W2!#M{oqtL$(ES*^*qZp((a{<$VZjsJ#Rq}YB5R?+Aq6yp z!N%nH^QRnZ+LVF%z@m!q!w}dq3}{mc#PNIZT+#8~G6iwX6v35NXkJ@K`d<+V+;ZHS zH$Wj50`^-k$#LuW>yJ=V4)}V`aqDOUq?Qo4feUY8u(4=f7783vvkvG?)kR$|Ynr~BlY-B**UqjPL<$t&nbeRcwnpVLx7J~H`hC@nE zXtJW3SVFTt58sU-KcFwX1rpSPV!UF+|Nb-FRWMIL47m*=yX+>JTfcM&87-l6#=@4C z&{-!CbmHUV-+M!i0zinvO-~ib$70!P^Rs{D{{mQq?LZuYkbq5i?gt*8O#Eed!s!;d z-3}gKD!K7ejn+`$U97E_P8WBoEix94-sP1KpRtxHOIY6oXPXyWKBgoFEL0+}Xq8Ks znY{YBI!vjw{oR@H@bEC$7m14DW@}JP!->93QN@WSZz(!`7|DkCmtpdXAc)fa;OOlz9lx0YbL+vDRSf4NECM_7MeUclAj zC2_618MIn2Dmz}(ZJ%|DRTYImHCkD1UZm5aqFz``(^AfA-LJi>8?LMKbyZ?bqBgdx zZ2-jj*WoS=bM?(XM0e4RE&!cKnv7|_xCq*v$?P4OL^9&uSz=oUi3{*KC%>ryeV1o(4Y-YRK03Q_TW*#UedCt%fuE+-4b+T7& zaRsfab4{=eSaPf z-P#|-)tPA(BLE8tkE8{QLMevznH5-F(o355qSY@d;sBl<5E;m_x_FK1;!^FCMdT9b z*B1r)*-fF|^h*r2fEWZZ!DCEeU+>%IyJIVWZCq4|z6`>~&=Ca8RMdzU5-gx21U&oN znl2;OnYg(3<4!Cj&)03PF!Twpz(oCLVQh^)rL>ZQrRM0lnw@75D0QX^~%xdz(V7w 
z2jd8&UDx4u7Jib*Vf+>SH^Z?wfNu5W1NjY@Wno&=5Uao{HAm`Y3)GE6#mawvxT1B9 zM!ZPAiyKxK4{TWyikE}Mv8%INU@gk|dft|A!`PgB|02XjxfnDyuR{$%`4CVZpi$(w z87l9YXJ8_t%q~=DUqQjR1*!*8kobm54PY}~SA;L8&sFYT#ka~VrGi}14>BUD15{yN zd8lN|^$!LI@Y%hh=EkCDE10<&d<1g6NpEukpyK-82Y(OS{ooqyR-u@ck^+?&S4ibl zUWW>69cg7f`4v~|6s>YpVh49CxS{6DKO@cI>UjAdBL%K#s=Ze5KE!Q2IPX_l7OW<^ zGP+&B;zIwr*abyD$XHb5u5W^hE6W=)Z!WIx!s>7iSVYRW$811}#AB>tO_9QPr+s|3 zeUJ~q`N6nHyt)SXcLv>h@9|l1;ewk0GMdnm3u~E*GMYrcj9alkP>0;5VLKSU0}-}q z@G|~cyuUv?9sl{7QjCEMfeDy!1&c$vf6K@<>9e{}WVDwx9%$xgPi5BB%1&d##ut)xe3ZWfoehuW~{7f5;e{p}XgkK`m&Amt17 zg&wtT;}y#x+J5whr2*crGpxi9&=DsX$OGKHHI+agGWKU{s-`l*rB2quqSwy*l!k&V ziG|6o;TAZyXbG2+!4$S?bQ6)lX%wP3Q$;WsgIFL3G~8F+zM_FWuD8{sSz-DPOm7)0 za2Az8%SQ?6+O@j(`3v5lKmGmx2fM=x&7|U9B>|QwwAZ9H$wi`ya}2QTSX_C9BmXk` zNvCLgF)OGW6D;Ae;R?e1S{z{Q6FH~}YZyy{6O|!8POqpb@7$|1?!4HwNERZXXL5CG9QVw4~uZxYu_T-}xj95!UnVXU+K(4z+_aw}6y@75g-XfO1Y) zNB3wPzy8F2%lG%k<~Hq3xEtrg@Cd=)-EOd&x1ZB(+2F-UQ_NoC({Bwe1PVq+n#O(F zaHKX;LAfjoEtx?ll;RlJa-h($=$C6dZvvrnzpdd401Yndr*Sq%bdJYApHjSDXmG{b z6;wmesD%bBtO!_KO1J8(d!;|rD9UR<-!#6lw%iA{2v$MZ&~;UmV+g=O(b#5vi(I(U zHopu{S9Sqd2*o%BbXdXoU;&lO`MrWFyUtLP8*_sCjUz1x^&>&O^Bevr{q*tg_~#xb zjb6sTGE|_P(g7@$2l2N$S^tAxvY^BWPgaa*2~^$}uRxD7eAPAm{_Iu>3jW-tegZvPEfK{N>aSPUUfEtQh!Ta-}b@n`u_k4Wcgy(b`e}Ob!3DaI}M`{-( zOz{J?P?01fVA>+SvHc!W2(3`u>J(DGNfYVC`O;-7U4Y>=+TJlyCf2+t3?Wtwt}9T# zQRJc?%j@fgd#d-0@V#5`0g4#yt0_vMh3a9=z zI4x829W+@$q`&ZN&0zDcRVc{Yw_t?`jI1nC5(vdyPI$0>Z`baZGgDzmLlH|!n8_;E zw~S$U8p~@qX7K?-d?IicZ!J~NVzg*BY5;mmyli>7x+P0Mc3GUSinN@=Tvl8hcqE=i z3MFDNLIProUFaPPUX%r$NHVhZo2&CPP){zgFg+6HeCH(z^YVCIIg3Ia^hqdzW?|+A zprsu9`@4yeJQQ3lsB^`{=JnZR=EIR6sR=luW7=~kS(JPj%=!edr@uuU|T1Hu`qA+t+tXn*RuyE`6yT=&Lx0KoC z3BZ6`j0-?Kpv(!1g-vmmY+KiDAMB6}@Ian(m2I`njji0f?e@ZgKlTgk*Ot&Fge)Mj zUvNE5#~-a3WS^`$$_hatcnowWoON#6ui;JDM)5H|~+ z7*??xkZ!36E*dWjq^^~G0V8?b0LN97*_J6X7?VWLPi)tnDsi~5Ta)UULY|lbjlLp* zvCFg&EXJxhKe{8A`QaI&gU3^8;jDJt!yE>*;fq&Q^vfzHh#u8o?uM*^x*wc>17JN_ zj8g=(g%n}73sDT?fcH4I-)k`UIv9vkxh&2gP6yX1)7B7Bfoyz?s?-{6Uep->p%daU 
z*pk~CvbQk2rv_@n9wZQ6hj9D^FtL|k#eLexO{(Bb$z7;@YGX94>%p>y%kwR$&I)qp z@Rt)s@>-}}rxS(K_-DSq9-k>I<8Km-n|1vA@ocX>jkqIW-1Kp6wcSUW6~r&^)H%cs z!$~$nu403_5e#8~T`TQgi?V?s$D?0?Nk4s=|K0r#>#j*d7TFQHt zoVY~;-IFqq6aNbTd|sC?-=zkcm$KJDXWfOWc;Yd)LET7AhEbgzG@0#BEID8VKfYBk z*icpk7l2bteF^XizCZh-=BM9soDnFb?u01AX^}=7>o~tV=eMq>pSrbcbmf8M4?Hi* z++XNrrfPW;)cqtiXaCPvO?kM6ZX2k_6%nd1i+C#4SZhGHk-qV~xC(#{*9eW;3oyE2 zN!x%zb<2iOX5AhF{VFa2x}Y|72;*Nuo>q>ta08h5>N8#XbBaRhoJpS9;Z>Mme?H0E z2dc)0!D%>OOd}S!pm1fqwltKIeUVIa?6WuEJi7le=o`QT?KC+&lNUhT2*|8uG$og9 ztJJnQ#=1T-Jj?+PA1=s`Ltwz~E;HK;%>pugyu)N%pdh|c=V~6Lwn2ygSvYd5- z4p_lPlWf88WJYznS<%r!fu(www5-iN9IJrIZ95nVp62oTiSHy6G(;{1;hja}y6Sm8 zJkeleC@(SmN1(UX`WxoAAc7EbgzVNfH+V(oWD*@vy??)vr3dTDA1b zB!tRw59%1)QSMYB3J-=M>8h;3+~Z6nhUdcrjwqh)nvYUN;9VK-%D%YM%V5ssVr~uR z&IU+YtKt3=H?SCVLT|)stT@qcUql{)rJ?E?P!c2E=MGttGmnog{?5hh(20CEv>2AX5Ujer~U0WQ`<;V-%0 zAP4#pP9zs;rz|5*F%l3etNDGvLY~5q^W0t#(un^7$BOt!T1L;k>X)f}KfUHVK({Zb z#6~qq7#tat=1>7v?dktQxamMxn~b9gMXRFB=tP*Ut|!#*+$i?#n>_$!TfzaVZ#;f} zh>r!`qqD+(+M^9PyTt}g57h0RzW!~aG}2&Kl2MwJ8B1AwcuB^*z;6!?HEXMxaF(;D zV~XL2%^ovv?S*K8@>Hsr@D+vhJ60J+{r^LpWtE}OeZ_MGdqQ2JnrQTYP&&qY#XN`7 zi<5-yr3y@3(J*j_%Fk$^Jn$YuJT)!yeL03I)TZ~}nzaF>Q;7D!U4ssElnV9esuuH& z>XHIhzhIpG!q8-@GW6)C=WS6A(f#rswa^YfT+iLcJ*?+Te)I2kIF=`82UZm@jwpNM zsf1<;m6g!WXK*XzO!9|fkcN=5PH%;kDc$)8SD7gAAj}&=8|oAi+FD$F6LtVT$XC@ZmsW> zi;FS91A{`4a3P_SE+|m0&qf-x_1vs3{kXcA8{nYv>ta+0+8$~jtQ|S$4Pdii+GI}n z@B6!nf4hI*P5kq}@Ye#YZDW_9og&=(FxE|>(u4#`B`=^%UPt=fg=Zg>AGH4^y7@%d zfTEqZ3b=6eB`q|m6|kl)NTt+pE>9E{&x5@>+-}2ddp>60Ol=&FWso)lZG#%=3SDMS zD$zD;K=aqn)3N@2i2H!&ZNPm>P2SX{{*p$`EW?cFB~ZJ_MyGFoi1VY{qZQ%e-dj?ePL{PNMs%=X4J*(*M2a0mY#!DjE1y3#yzW3LR}A!n|O0 zs)`k5Q;aq7b@d^BKzvO#{+-@c?sV7|qiV1KR$%|(O7j;{3TZnrd4E)pg7=UARn71G zZW$atwaTEm+PrTIW$F|Lpb}LD=D{$J*8{{YZn#1)LXRQ_(Upu5OW~(fLS<-G1#ZO+ zSi=Ty@270@QSTGR4qzi#bAUAy&Rehw8+WF#_GLh}hRwYjO~UQ7G~>}q!JsA`z@??h zLh>89J4r~l26Vmwh^!B$sqx9d%tf=|d}};Yw;9JlIWz&K%PNatfG6RcI@a$l9()bD zISfQ-R7(q&x7AUB2?=w(3e1Y=&01hpA`#77pzgPCWBzZr6Bl}8SvXI`Th+#;b*>mW 
zF|N3+OmZnXhWUXC6n_|aiTxRj8eX3w(D+--s^gy=OO*>|&W<3C{Rx8x%V44d1E(1& zh+|B$4)9NS|9^g^;fJq}uMLoKtNwX@e7f=Be;e4Ke}+s7b8+D-F+ncK`gN^`G1Q?e@ps`JX@R{zo(R=ceIb-Cu9_ z_~)YIG{qy}?Ty+k2+1pG3fL_v>L zA>#%m^JGT&b%-_{LH?M${l`ym*5TG|AiiIt};xt*$y2_#= z*k{H5;#}sUB>lrK6vp`?!Zlc&r0ZJT0%EjLt|CRxjBQfsJic@Gx*EqxpmPE|Es~JP zLU9aA{7Kchf--eWU0hMUL3h4xLX;R~ppwo7aNo(-m%S)1E0Nq1MxdX&KrwK_sh4}J zd#>CrpqEnF1qlJ3Sq_6xw!~Z%sZh`{E^%#Xp`rqL(h&Mtb_NEYewLbSd8yAEffY%# zRN3qG5M>Njz~P4>|KzEl6P7{h1sNC1(x|xueI&*Aks@D{Hz5u25ZK_o5AX5!xqTkc z>tW+5P^~j1F$}OEkz{FFBH8-FLI(UU!tWdfb~32D4%9eP{%B--(i^e>G)4}(z)8iX z(zj=QSBD8pdVXTNNg1j1c_mom0?tOSU{H!)bZF$TM4F3f3F&uCN^W-}pZej?+iQ%C zv(2aZ^GCn2)b-PL$KqvF$rc;&{@Ols-m1l<9ynBDk@|2&ky3+6bk>`FA%ie_e^Rd= z3~qY;60D}c+NWbxhx&p3nNOx*PW-f>K2m%HG|w5(=4`PE?CJ{OeOHHyLSEtAc4e)X z6^lC{o0088O#_g+3Kfo)8U=m**zgaD5jnlJ;evJE7-0$29b?MK0&N+$V3-ZwdXM*(?bY*3(PLb`Y!Eg=KWfX7l1d-3)s;df`oqoCF-y@Su~Zdw(V#dxO2qjk1T76 z)$la8-GbFY(&FBoo3oZ-FQyho6M58>Xam0~EIaGAer6Rgr$7&`7W=;_Q#aTed9p=4F*dMqq3* zrmet)K%w)+;ScgR8rB>3P zq@Ufky2K9y*M)~B0Scn@eQ~UyG;|0NeLz3Dx5r-g!>ctZVg+$eH#h>@iEaJKbM{FD ztET=*;HHE(>j&zjLa92!`c9?w2_+<7(EWROiNV_7-)>Tex2G^XSs4%5pW*g2Y9}vs z=X9YvXEt>mH~fA}xQo{nYMF6@?)Y7jp8j=)Ax3fz9*e7%Zs(&B?IZ1EIe|EEW`Y6a_er$KyjM;GgQ`C{ctK(VUg zB4F81y#f@X0c`cZZ^(;;YnrJ$2E}_=!LJx|f)=bXE9BJc@olst_!)O;X;M!1YYDYa zJ|AsRf8V5h%k_`0YmvfQo6BuAs1a;@0OLgvf-tC>+9#RRNAP|hsg`Fs;9EWf6l?i# zskC3KNU@yx_$ILF`-J4mcpF$gyd`OOR3;tHIuUGGeeFR7KP_x&H9=hmE&hj^Q(N-> z3#`112TtvDgQf%JQUG~H8n46^R<=UX`dbU3G|woPhFv!6Yr938Y!bFO7gZ3xMN zWG5sHSGYhcbF%qrHB^r}y{;(mT!Z@cQBg>{m=$ih=`7J^+CbU^jcC81MmTw@zBt^h zZ+G`A#kaMa4&4p5*ta}S1=aC($KycPTnaaRKq=btniZ*A(6qRTVQCgwV)!9beC?z_ z&=&6X0z1dxrBDU?(ODf+8MLW0$**quu|aLVwfd6a@4R;t)^YFhF*dAY1#S+^FnXUw z6I}Sk9kC^-yt~5je}Y*Eq8tw2hm)>Kdsq{Zh&5p~5o<)H25)t7xWe$0l2PFRLvYgYK=k<~YuT|~NI_t_ z5yJ0Gj%?}D5U@pnAg~tCyB4fKw+0#kC6_O0z(5-Q8i;l z?)Z+qB1lY-$^xm(r@ww*bWb>NEn6`r zDbuuEt?|O=0kx<+5qJ%k1KDjiV6v)emCAq|X#>)&!uwCp)Y`0uE~-Yin(=|nvQ%}X zpJ{N+@Z#DB;P<_to2yo$Fmc!>i&`{doJ=a;ElA_tXEoFdQAj;rh4=VJNxW9IJ{<~d 
zXuK{~l^iRRyvkaznhlj#KNCl;sJyOiwJ1`v3b5iYb)erJGwknE-#_;u+=6*G<{jdy zzk&HtQ2YI?vDSu)9{K(SQURB0{8ueV-=M>-MT%CoyRsJ&g3`H~;4Ir3I;+uFVaChf zJU-s?`(y?Qv=Kuc)xtGm=F<6vKy}3U1#DnmK$9{d{T`7b3%^AA)hYX=UemzEb392; zPlM~9o*eZpS{!~^a$MwwPvM5!LxGhZDVA8o^;P}q;&B$UAIVt6Fh6vASW4&5NZso7 zN2JFnp_SD6FsY`g=@mUhdO_ru9DYY@1E5Wts)JRa0QH`chQu7%>KmT!!R^{Vtuc># zHV+RsVD5s6VW9RPPwKP?)Sx0vA4FQ^7684`>_+Sj9gOQ zq`XC!wJ2gpk)|7K+W?m}B(1+dgVk_U(oz@v9w=0|K3}e@BgJJE*;z5z-{Fp|-KwVV zuliMLja=d?&W|KgC+|ItSYLiNq_H7TxQA5cA^<`9v4=FKDlBHJe(Vlyj5i+A$F?Nm zR=U)CKodD%v@Pk~aPJ{r)&2PF>&aS=4Hu>!HYDliZMyvWHkGs6zP{nF{5fvZM?F$; z!VkXwrYJpG)1p=(`VnhbO^S{`CERa^!pcJF|$G`min?{FYq2S*5N+vl-ABt-M36DSL>SNfN?yd0cuQG zo#}&gA<67*Mgr$eE4M7F2wGm$S@}3xSOuHipk2_Q$`M{lZ=zqSyq9)>1hrT{2DWPe z-Hj~*Jwf%qK_M!2>gmQUA}nO*#`~aQbt6(AbL*)V^ghhL#-IOleayMEQ=nMOq>zn@ zZ1rvMa33D|ur*3@d01=HPKE3Mvn&drok|1N*9`^E#UC%2Eo%->0uWvukFQ;^2a4$z zY;bb{1|Y{kVY<*b$K1;@S|Jty7lz>022k^=mPpg^QzO!Tx?BxfuCBc-Pa)mZtEOvO z)Y_5r3<~fyoG2z#;}Bj_JX@I8k~QVU(}jtPYeCupRS2rVp_Q50svlU&s;{8Rd;z!| zWmTbeCIx*4-`McuRA5U&j-V_qf0%j!3AvUpKYHi<;v~t2Xy_hN>2gV+3F+^SXzlr_ z5v)b|0*^lC*EOiX!?3|YSD2KACOUDr8oF+H?9$^eR{;PXsw&!UXO{TK1+2nXg(6j} z0DxhowYsdUaz*%gShZHW+=ZGv>}7)bRY3h*LXD5V3xD4HpvG}Be^dT8*k^Rsxvijz zQdys|4bsqt$n8GZJ`caPtSgUUwffzmmz7>PMOuU^Qs$?shI7dodi|XTU~T_I?f|&3%zqYhdG{Z3#5Tq52-GqL zaOv|`k>blL7Zl8}WL}(3Rralw=su9kmRlZqs34AqONSx^c8fp1s}xGK7wZfmb)Tj|mk4WhJbb+?c7W8AUlFM~zjZF3lYZFsVa zHHl)luHkfbFs}`wxBh+i=Qk`*<{co)sK886E7t%_7X!D|#bDbt^2qy3mat@93s=xo z7z9&T!1{TFHC|dK1ao@{sk5y|55v0C>b@TGmIUT5RwG8XP;FgdzuJ?J7rS8BK%t7& zJBzbVHKZ|yWnf?F{f3NF_(cg!vR#Xiz+`G;9qD(c!3FG;mS6Lj7FEQi%UZ+)$#Ye7 zb*P_R{6K~H?Z%kS`8pQtRq3q+(ny~z*W%+fR$H>22#gd%goR6rQ@bcB4gV4x&456( z9)B)KBgw_%iY_8ts0@T~Bp7Y-1x9rOV`+Ipw`UlG0ktjFUqc9AIbKgO!3ODYH6hgc zB*MEe!~5_GjFs153{))ojBm6TNC@La(q>tQ?FO`#w5f_ zNf2IHL*OErsp?=f50jRa4w~7sw27nEs`I`{8^d3c9&VW29 z5Pt@-VhCpehv{Xoilc3WiO0t;0PQABg@ETd8iM;eIfjAOH07w zf6^$l4D#%uPUpqO|Ci8jb2$Az#j-J0v?as}!jH}lL`oo_?8RWGaZ&KcTNsS-qr01t zVIh+v9b^el5cT>)A0zxKthT)vatR2GkwoG)+ND&JtwBswRul6*FF*u|&xQ2L)i 
zPT~gB#Z$ou|MKgj+bEw0SM~$F2T`ljz4bJ3$!+`_Xp7JW)Z=$21!|6#^|ot(+b!O} zrAcSJGha_%Lx~5%kX0q1_@AbMC{CSlf}5CZ!THr4J=-BOn63?5N_3DhJZ@KzYMxfq z(G_)wwEgpMAx&}@@gYAEt+DB1R~}La!;=R3h*FX^HAhc#B-pP(eo&CKc5bIg+}_fk z6^P<{e^!Jgz2d;G_clhpQuWhUr=IrE@kc6UZ6!aeVWnDE_}$5|r9S3iAQ^<(oYSHf zaO1Vl^cA?iWGshxV~4YF^J^rqxWa7=kU&Ck0!j+Jpc#jv=QWt|*?h15^k=6*M1s+mOLdqB0Vj@F`Hdma<4`H5xj^?e;J(#sf{|9po#}A) zw+iv7qi6g}gejJpZbLYiSze$P>!7P=i(A*Y*i8N?9 z#A`lTZYGeXi=kqUK$sfe^$N!M0CBe@N@2Ggyv(FI9fFS9>l_9qdN7P2%Naqd>2_D? zc5eq-uu#KHn2rD1UcrefiyFjpLFa1b?K@ML=`8Y>C7Ib9P@a*E76+|%c?#Q$E}p5S5((kWc^s7ErRg&tXQxrJjU?YDksCX|JuQwG%9?=lP)n^Cy}S3?~s67$#zwg>vzbd zw0-K~$-$V5cDoRH#H6E#Uo?4*l68IJMPZE0vWLYH&J;A`#C; zDZh-C>wViyH&Pb6q99W`8wbP(P6LU=gQ*z|-RCWh5e|MEb>Ymi!l<6FI(4;9)c_es%~**c>KnK@xblw_F3xHtMEq>YDQ9{ILQ>53ga| zkm4+^Cx(ybb(q9wtF6w&m+~_y0tKsLE6HtNz=Rw$2~@YkT=QN8PIT!DiwPxeZ2f^9 znABo>F;$N_{ME&zfbVy40;L@9O@*`Zj^W(agi)1QCUd;2L;O6zD8{-?3r4Z3E5-B@ zxDW~)SIGxgi|4&}r(lL*gQbH_et#O~Xj6qF=D;8$kEUKE9Z1To!mM8JJ}7bF{Gg3W zyonU3eAARe)9XF1Exb@wkY5#;ept7N>ak~9PJ#7V8LxQ3$>yXmg}@T+k6~_Eel@_v z;sWT;L%kCj*&|BoM2;a{58dBGggEmaCTV<^e*^V=8Qd(He_jW1v`heM5ftas&_KdT z3rbq!#aV~BQA18iQ7~gD>Q3u9r$%~WU$VZ3fd^8bGs_qWbIn7LAG@F3y`;c1rA&($ z$+k{A#bj8*h-R;LU~1kaswJEDX5ZC8mV2{=3t}bs9G$KmBkRrL zYHEN@C2LjOPhi#d*?nx|lq1F>-(Ck)}c zkd>}d?IUZAr{~f+vr04DlulrK!o_4YIJr=m#!e9uT?VSL^uC z@XtU0;O~oUXf($$wlK%(_v|Mjzku#_%r)Fxi>F$bJZ=JbFRP3B`CoJHK=E@1;wGG< z3-P=`>75ieQ5tNo;G75BVQPhCNoE}{P4J_<`?t5jJPzA_uK7-~dkh+ghykb1qU>vw z5Z6PLR9(%=a)xVW4aj~J@enBi6%g_aS$cH>amAUgit%IlV1}~d?bGYYuW6ZR99sc` zJ(N7h<#hD&lqU6Hsxaw87J0QSoNwV&Wx2VUKYaQlpO(Ep?6NuTTnZ%hg2W{ab6g(dynqgXnCB!<& zkN?&c4;JViJkLNOkfv@0PQz1KwgG3Qa}D~-S7v~bt|S@g1H+2i9iR{^fxuP+(9@J9 zZ5p{X@tYe9q+PEac{81+_q@Y5vyStlr2)4Z{LyViS%W<|U`%Q~r^8fhmD`kXT*Mg` z6r0ocQanf~Sm40stVg}73Ffl605!xa%KhaGyF}x5rTE~&f{woqEIXK>FX)=wL5;-~fjFpmb$e>U{bpdTkeYg4M?Y|;D5WWqqh zx>hDP!j+eQJX;dX!(vXtGzChzl}=;5zG1(1nPKy^rBIT7-cDVtlRqy5rznAB!1?P_ z4KSGDNhpsGROZyWSzvqQSTg7q+dPJFf!YsLD`0(d1O51Ilu#f^5*ke?4>OS~O5kWB 
zu3})~RlKUKBsplgnsGNkf%z)VFw5!;;ryFFx&c9<5g8a{HH%m;;s@2AD#(xBE)wW~ zDX1{77y_+UKoaUB4*VkR9GNhXr()9ya>%JXhIri8#*5#e^uwa9ebi7;Mg0zj8l?o6 z6=eug)>%~`MT@`+(2p(z@`VTIll^MQzi$-q*ijAK%y)e<*OFjxtTc(9fpa(Um_Py= z40s99k8PmKHT9E1`EKIG`??NET2zKlKE`_>u1&CtaWWlFePXLt*ED%9`ka8g0m;hC zOt(txE3;bF&CjoML)H+e5~Rrl3KWyusle&%6r9v>jy#J*wpis(&Nz1p=S&F%Piro{ zWCWzd32U7$dV|EU9gP3^&xCP~^~@Vp_K;6dz(tu_wW`|pg)CveV{Fc2W^Wx9?m;Dw ze0JFxO@q>&clFi$>W+-8!@F5W&zs9B6HcLL@-L8U4LB+}dgK+f632&ACc<|W=n=wXb0(m`6?Q0sA#?3m#Q5a#RaoJ;(Rq}FJ@6WzP9aBJbA`SFI zX`Re0uOy2TX~z)DJAP$F5H>x&cw3iIp|CfHhA7>ghVD+C8cr3AA4k)jCe4I6y`p1@ z#zqcQ#^S>c)*fa2Ox@pwVz@tRIMFyOyynzH;IF!9?P#$|o3w22jqxPTVFWFaQKxUv zn1kX|6(md)rEr7R+FbU=Ua8ceuA^&Q7JX$?32eoHyN9^wwWV(tV_d;IR!D`DcQ zk_(iY5d&{HmvAdMKS&rtm2vG{D_e2gR2xwP3iy!S#aD?2__ny{S+0{+P{zAyb3?|+ zGyXE9WK^9PtDsyw823_+qpswDDAyglP*n48uqQ5@BD6*j3ayc}-a)A} z4*Xn4IsTDfDUIgcY2ZZ?kU01B`YeDNhZV9bKz)(Y*&u_zq=gMk4b@{kZzd4|@n00T z5{&$;5PnmHn8V$@aj)81!(Y9-dB@*gxpOgAcvJT`{U+XSi)e}Ug-x#95Ji|u;?|&6 z3(WjX>Pt?;pT{w2-KD*`vxxBWe|>L~b7YvpJ3T1nUr%cLZGnsr=Zv!No+gN2sU(Vg z>n{{KLi3K#^x}Vmms9tu?&h&~!{g)aIwgje*SFj4_W1X2xW|7n-bRjZ4U*gG z|8H{5$k9Z)xccL{wur@!{hL7!}(_6O6DNz*K+I+&1;JC#jLC21QQ0JcMB|TTfNA1 zHU8_fLI+!tHsiPF)RHr~M}n(G4JJ77rTg!B--_vqq)ohJz|Ab?l6WjYNw=M zop*LBa$Ac;mO9Rf6*m32=@0QwH`AZ%YZ1nc#vmx3c2jIl0rbidX6kL!Q63aDU5YIO zAGWC$RSr|U;;pw(H&T{-cQK0{2q#nXt%egz9AR_bd9<=Hx8+UG4+Y?7^-dj8!bGv9 zt68MFZW`A-T+4k@P~Ms@20;9xNo*;zHQn+fQa#?DP$_Ic`q5$KMd`2()2Bd3?@vkC?JAhf52%~%mXg<^?CN;CJ6eII?z@uThnS%>rymSNv+q~$M5jmgrf%R z`zfuhc?KGM>l{$ONgN7=4LjcV5+uPy8ab?N(eOqC&=Rf~RFq0g^CJar zP@@);oE|AecL=v*!=DK2_VD7fm4<$3l#JTEW*~>^3i~ zAFbem8G3gq8ChsXS&DNEqmmUMZ^G$E1C5tlQ84BQ5?oVM`u}Nrx8=NXENS$bk5Lh0 zRoSv+;tUW}?O&lC;n){{U+I3{|1CChfrkZtBmq)xSC`B7Om}C4IIKgiOsdEla)K1) zHO1J~8l7RSx`usPtmo5|F@^cIawa~E($vHJta0OESS=i^zt&O`VCp3*g>qb2VE%4} zIlr27a+e?fb5)idMp*U^utc$P+ND&qIQqOEi!i^>E_Ib--yNl(i}TBP{X*T7k6~xt zuYswZ%MbPYDIX=bW^-SaiQ~??!iB+NQdTfZfaDgSy3cui9p}Tx-*!X$1bH# zuw@WT_yjl?j-4V>m~FzDiFMGZTuF9A)FCqjz4KeK`M$Mf06MjTvK2c@+@@564y37} 
zRRQ^MgZkxXlKrMO_6^a7ty1l}KMS02B7;_K1J3X=ceTQi`R}E#zNdA=P_oXOMPW3^ zt_~8}-Tb1v*ktae`sPNa0I>$brG~$pQZh1N=qb}bMf#DEzJ(&^%!7Szq%ktQJ>eJT z>8ihsfh_hHdXx)oG78P z1yz~EP+iaMhRvjcQhnTfmE_?9oiLClvpcY?VwgJrnof)33dbA3<7vb&b6t6g0 z%bh&QNvF65lZLwEcy8!-wRN+j54a1{P{1sjV9tt_!|-Qd?n*+gC|UYpCd0ux$WN+@ zL*3aNOs9<3B_Xro5mdv(;B4BZvbgPeo)q$|bPLd9wu+eRCHJaMnC%)uKru(NV8pWe zgK+c*b(BT8?Di_TYTIRyx~+I$s<5r!orU$}K_6^ahxwa|xGyj64=F15`vTRwDD#!n zh%%(VUc{M1;H`kZuu51_dLGWh5t&#P6_8j<5&i5)80Hp8^b3h;E2rGzNBE!!xeL?k zY>C|h6XV*CH~ZHjyIoz~4boh0GSf>$%+qtEx!~{;qk0l$f{gjiX^gYdsyzM;)i(D$GC)aS2>QsWTLfc1CV)p!zTUcehzwL+8l=pnV?Z0q11_d zfSSKO#Uo8nJF>v>$0~ATym6OiLD=-Vr>5teWZg~oBSsJ3lXdvDqC=gvj62&W`Q2WQwTn**t%KUr?NycaGa)f#> zt}i4)*#>2geY%cmgVtegD~(fkDY$qzA=Pk_cJ%4j@6K>$>vP|PGyZvf@=V7fbs-@= z%8i~C#yC(##xw55f^MSvTqL_$%E62jq>Ev7$boX=7`ZrQSC>YD*r0P zpKcP2y_zaL`X?;kojYO~nZvv~$P_=tfpzJr#;|d18!z}Lu&D|7&I~$*rJm_97BF@1 zXOaIp%wsj}`LkawF7L=yZ&y>#b;fD(sP@U_dZ_gu3Pg5%d{&gAq=`;GK9#k+A$itD z6D2hS9QJ@RI3qc6k15?_;LLPYl)tG`&&f@*km>Z>fFF0{BGa1*yH&7qBs=GtQHO{^ z(2K5$u{s1x6t^+n>msH&9=v1;5za@XQ_)t>9uzgC-N_L$bo{lr>VGz;@fyT7^tpLe zMDwm}`ZlC}(tz^^j3P(p7Bcfc;HITtgkQhhXk_@c@o7``mSFXIkXR7nriOgmmj{tC zETgb-mSfB7>mID8S_G>ftVh-$gyW=om~6v2Sup;|$2~{MsNbZk9^*VbNY=bE0!4LH zm8&pM#T586FB2Az4KF}UVahnTtG%R2 z;>_a2TVc{cV!@r^!5!+O(^Y?p^7rrVM0!=G0Q6OReX9_@QT}H)lUcR@J(X*301Yi~ zr*Ew^KpQ&kQ&Vv6jN3%Z5L~XIJkhpegt4n{{QPtVR2M;LDlaF$O2*Nd$3RsTCjs^R zE%MQiQxAR&&))h=>4#uF*>rts8^9J~D2q8dMO5}W$WtwrDPE3u#Ofbpq1T&<$^QpC z-Ar|$AKg$=*Ww?E?(!%dqylNI-0)eDa!KVR&v0u=2bOmFM}hVB;V+d1)PfkDgd2056|Q#L+jZxM%vmM!*ST|ks26}*!U!VMSH(B+&4s(&%W0i(z;GaPbotOjZuUD06m?8@g zfVrrd2J{A}pXT`+aBf?0P@!;HYz_BS+)4pj)7sfaZe%@@e&{JLFfEs=CTYbg$iM!I zL#EDet2LW5v{4U{S1@{pF>8k~fceMo&z&F8`~GYG0X6@>9)JA&>*uN+QmACg6smdN zS+$!8eWp@1e#nEIY|m@LUY8ta*CB%m0rD4!DYfAAqgSzLU5sBQr_6Rzz0IsiV;8*| zSROgU{Ezys4zgk{qv@<%)T`M8x#-noU_Ta!|Ed4z{;K~d_S~LJlxP3_O93zt@t|d$ zx~JJvEl$Vt%c>>WatnoD42KVr+ndZNr}A`G^lJH3bHB9)A2728a2} 
z@TX4}MPwKgronrbEx1BbO;Fjg0dHk(LyW=Rl0+o!_`!KOXzu@7;#8BuoSz8ney`&H^?a&>dHqbc4ZJ_pq$<_GX3!5OeQ$3F|r@?}TecEM%^mtYVzN0ev zX!*R4l{zZ24XXFf`}i{sFN2&D?O`!I*~z<`C$Bd~EbCPj*4b;Aon+!f{qT2WwW4Fx zFK)(}{~l+G%4P94YWIpEq^%lKDAn`u#20N;L!xyJSJXaJD$e)6c-Alea=1r0!PLg= zL#ek=z$8Iobsgs8u}_JTx!OcsdMY66Lt9q!$=1o{-EyD}H~zsBg{RaHo=hzsMO)P! ztIhQ=agpZW=JI6=(iq2cw$$Zw&m^`|OoG8tF13@9=EpQIs%=5~IeYHO6Y?P)uSa{U z2C)F*O{K#hQecJRTPXHH(GsvEB;SDe+>l}P{(&%OWb0fBw|!^3fC2S(Y6>&}jq%Ti zR6Ew%es4ho7291S&d4RO;~&VQc088ZdYVlhC6S+0H)@w_?Ej7R+<3#TD{6T(O$7-w zE2~GU8&?u%w7IIO<2wKH2(?}Mle=xQHFX@R%QNFmXaDc3n^d)Rt|*%FH0;a|JukSM zEU8;Zx!Q$IYv|lld7fZPBar)i&{AXfvG{gPTHtLwm5t+~lT`g8>51X(AgL!q39|x2aM-726 zKEp8Z#!POYxlw4Jqj18%co1dMM8y3n%>8SAV0@29!)o1J{1oZt>;rL;{2op!mnn4{ z=NI4lJsapJ7x2-5EK!pm>{*GPs!GhX3%DeF8 z^&hN0ph=xBO;k%#>=YE!`wT@T3Uz1I)j+=QjcmkN@XZH*dBR|d@DqIN_)Lk`02|cc&YzDn>TkSFUVRg}lrS{8Yqg_SnT~z*mM)me>XU4@;MVXBez-Bn}ZMiI!JE;Ll z%wrBrZ9(-#-%e-Q@fyV6zyElkylbS)7E`a{iLh19vV>KbAKfqvQ4Qs(AHEiTX5uz( z`Sh-*Cy6IW0J^#5`zD+ZVu=r98=FK<`lgTWvN%ABK$#c4>iOf2A}NW@S1^JOy(K+Z zZl8_n$aOc`S$5ok$C83N@Az_Zpybvfn}A*)&+?7>-K%Xn+nsLpP0z^@jMJ+qAwww~ z1xUlw93cN)k?(I6tAbqOpmZQ}=k?VD*7Q+OTb^nm4KTA}=C7d~d5SIXr)Xq^36#Rn zu93YEY@_m^iOeQc3*zhFU%OVZTvTry8PxK9$}3^yE`#f-{{2t7HIsx{+_%fFa*mO9 zraRw3kMWPqut~c!ub!Fta(S+OcIy!| zeUG7W$(l8)@WXJqqu2npD+|bvvNaDmr#3AkFqTYyQ(!c}PRcJ!DlkuiC7ny4Ek}OQ z+OR9K_@!_;H$Cwx!HF1VJuIYG>2w!~aUCO7!iYRM91h-ZucT%VRm61Kfazw}`2v7k zwB8{9_(GzYO-L0sq6rj&JZF}}aP}zmZNDH{510+rlo#6~7rs&lAyILRvdY1%JaEN@x<%ILS6u~Q ze$8FHcEB67ke49|(L-(Y2qp>T!Gxy)=4GKg=^7t$*p}KMa0yQ}>2&8@!}hZ{3%Fo!+9t?7z6M2BT*y4rgdxJ%ZBF_6V1 z%jxP@ucg9a?*+2WKCmgz8BE#%O0nvKERuFHH?!+V<4&zREYs_^3ST}O07DT~_ZSOb z-=K61r|mhvK{bT9-hWIjg74$hWZy!*cyDGi4vfyg4iY8L0%1|%ZbP_rVHn2^eeF(# z8|Qnl@X{4L`AM0OHT+Y1-_=2Wc451Ej(*dR<7+?u7@YeVFOLZ3x(CKQ&|@b$wP4q~ zpX(mD-ur1RxDGWNc}MRiO$xJ&m&F}2!wsTxAj>V6MG~Qww><1n>E;i61K9K!{rk|z zdiz>7v?_~=ESmMNPl528eET>3@@M+%Uw?HS=hiiuuXwkTrvY%8ClusXCtMTC 
zVOlOd#6MSsC;4!lK!Ucyd%{Wy!kOzb>589J5v%8?JDwwjST~J*#LG`5b=RS#KuFw0>!<2+XGbX$dr2KMCSR|g|ittC9`)*F6WQFg;uQFaUMJRVkJH^(Qm$t8jC zH9#0IAQj>e?Nx~R{u##BHmeLx`GrJDMq@b*6Q%%BDflJBP_tZThWTFm-ciM;wU=Hg z6PrEXg{`|1u_}5`l{m|L>p;5Fs4VFW`mT9r#&{<7aifn#104YZSvW)dVFysX#Ke6z zd~CqE_EP5-t7rXHjCyRQjzX(=G!nJ|=?Byf7MP04npwUt{U(=QYp?luMo0AbC$USA z%OrLN_G5wgUz)_WhFp^jIXF_!+%f=63p`I}URfxQRk7=J6Nt|fW7z5gx4E`Ta#Kjb zq;WF6gpR$0JiEn^hDe{f4)iy&+n{YwOmBBa8@}jJk-lbn3}H{>SN@(-h9`|?lCZAh zJno*wGK07>?*zds8p~Dia}xzhmWqL8D4S6FF;lk|fec7@zzS23( z6I;R>0h4#OLAgnR%=*Aplns5paBv)Hs3wh z3mV9}DgRFUe!yVhdWgcN;6`HgRhaMH_h)h&Ou;w=106oR#y0RrkBlv!Lw{6dHRPQV zyc{f>as$xi_SWS1Ko+A_oHe0<>obEn3&*8|T#ST!IIAN8wQ{Me_Y+Iz4Cp5TbxV0o zrN7+k-!tRTjnA+BH~nMW5@3t<)F=E`sNJP(48{a&c)`sSgeBc_-_;lNqkEKoC z9PIi?^5y9g*eSi#S;eyluD7Ibh^J!Nq1>lHkVvf0b_ysHeSCZz!2ImQIDX4|u*P4i zwd#!1x}!z~eS7bRiE>il)XTe>h3;yrS*QucG1of$REH5}9PDM8yLom*c$MWM4RgH) zPQThunJ$ksAK?60mvdJS1O{JE=^yd!%{!4{R7h&*KoyJ&0uA!AYsI zn4f2Kj_gCXL*1)$b!UDy%vwQ_A!)XCgL35ssurnSeme)6Yr?^fXA$OALPvHXGMKg> zd*9a*kO{k&`U6Z0>tOLISvb1dr+@Z{PF~Lk^$uQSr{czn500V={QeE2^ z-V!o}d6LQ~>sz3kCy4y~xglRauk=egR9lHY>s#D#B@sNlf;bRgOs=(#7Xj8(4I8x^ z$|3Q=^F8aG_95QM>qsYG`|lA3gmV1+0M7^hE44<%)`aUeO@4 zA?^m}=LaQqB~J13?M2F`r|(Zf{CxWL^dy`eQB6*JeEeyDKFWhnsi>OlbnH$(LJb4l zP7sexV7VR~gt(cVqa})0bv>IH(ediHpQQm1>f`F~)e%l*Db&A;b9fMif;P->Z!8a* zRp_*)r8Qv#)D>G)wE_J>hp5L-&*MwN`ZR(bKJ@std>yt_W93?Tr?AXZeTt+7tLpFt z6(k1LxT5_SbXHhLe?7hXPwAhp%c8zMo_gO*JEb^Z!)k%AsN`Ezba|Mp0<+9X{9gC% zk*J~?$?%$VOC+X>*n;wRp1MY=5$B)=Zb$VecxgE&A;We7uUXJbw<|g&C0b|2=sMDz z^+yrfs z;=bZ75;F@}R#*5mMwtTD>mdpa!J+NoJpNuosRAWrHq{qV#>uAeqM6##@-oD!lb|E> zqs7Z7Zc+tm*sb03T+<2JFY&7aHd8$xV`b@DOK@e+7L(T`l&A6BwgYu42O8s@0Zll* zS>*TN0(q&AV5Y-QBA|7cag`=kZ(Ijhr{6dXnc6!lJ6TaTeR0gfIAz&a=cH-h&l9gr z3yN><`k%A>hD-l+%aiq>Rofhw!g&EDUGyHzkyM!RJ>)a6K8JuQ<8Xx3PFdLsy4+AhHxk-W=b2_P))}Kn^rH zZUn^PQ*KBNM)Z?6Cm(wd0CVL#ad=rm=>gd zu~MSBD7~%a-0)SJXWSjeat+p%&FVSU!ok4!LH5S0C=UUMQPH)8Rww6tSMxrR711n1Yfz%=a#S zAID$I0{+??CB9R1R{hLc6$9epU7)#?D{UK4dyv=1dqAfmzRt{ajG#CklceWVo251` 
ziS*XAP0zl&n-Y=hYD*+nm+q;Mz*;3rpo=OL>{M|DGyr|17!--fjzUo1mL3bjiQvg` zb63LY*|k5%PBom53R`oFAr6Q{U36EU&^dInD!mazc3jVU0TVVUy=#UUInJkPE+t+4 z&p>${CedHafB+v}xrMR`v?`Z~BrDMp#kRUV<8%Bcet&&^Q?KIf)upfQ(}y25+X0)x z`0l+VGf>HDd4j0bFiNpSrOC@pFe+qs>mcRo(%45$@#q6N&on-yOwStZ93oM7}Rv$ghBTmIk7Q4yGE zvUIpoldWwEFhm|(I;9*)*v?vJE#n2Yi&dL-!v2tA{g0q7E%I8?;@7M!~= z>4Sm71)&qeDvzL#_k-7%`YNImzb>k=S%0Yo>MjA&4AVdL+p~J!AQdf0i3mf)*Yx}| z4SL}8a{2f&>piys^(ticOUjXbjOvG_UWi|aFb^Mpw^pX6%Ssi}Yr#%_K3e+%>$*|J zqyyUe{T=JxR&9w-vHD`*T#G@23d8HpKCH7!=svw7lZPa6{P^>j?HA}g zcc+$Pe4@SEfOCFOH6Ou2zrpL$hn#TQ;?&ElIJc_00ZNOZNvO8KxpYPZVRVZ0`#?kIkX|d-bQyJUnO|~oP2)fp z(tv{niJ1|ZNgQvjDajOjI!^+V0&9d4SVAgEv$nTg9VldqZTRVqHHzbvukY5;CMulS z6!@U9!*HhfbioNKIp~j7l<(`?6qbz1zhV6R(7IFZRw8t9<j)%Tw_*IB1m%C0f{s}#dSM@@D($Ilfm0Ox zH;VWjXeO*4!PUvlq)uL6&$&0|xQfuPtZhes{Co&>cR}Sk@1RIZ3XBrVo8f`iNg=eE z{*QH-pWVKMI@_P&3Qn;O&C{fdkf7{1&zm`1{2P*XFI3vAB271`UHuOSBE3BrBVK^2 zW)3-ql2d?>F3CEHrJ+*omLN~gu~P+Oj-702<5Ll?mAd-jiKpkL-I)$QZbTa=(L;|wNCN9%|56r%5>nDNi33lmt7U(`)4PI=fREBGjE@j zT|c*d0^nD-Pb57qMl3?=*q{)X@$9bGT=*Mp@|Qe(Sx4%%4?dp$xGb?s>=#W#q862W z!FQ&mHhImjK52=o*Zkpd<|n<<&S|=rZdDrPbUHPuWty z&^52<7C@;d*2^}W)gQyvyPniXaK#5wS1>}c4azTiIi;Ur_i^rm=x4OlU?DM<)|R+rIs2akO6_c|pnrFlk(6H2~DF zr5rI)o$Elm`0LR;J{J9_%VE>>EGZ-*XpVTl@>{hKn=rHo8NXpY7q;!xx0@0Y>g7Zi zT)6CmHy3Ie|4lOf=gsuXH`I=frkM^GTY% zJV0yUOMv!mBidu(mz}5S^vV}3rPrW1qwi$-tQZ<3EPArpIHiV@co2hpTS29SY;)`X zJlX&o$>pARa4F6caQ1P~e&-q+n(Kh$=tCK+ww~|rmDfJ*2YEg$JQ`~= z32VAdd1N9rZ`xvyYSmRyE?m->6J{OvYI>Jx8{|Zj0HQ#q!GPn1T+L;TWmf4VsOLc8 zngksxmp?m5FpnL_@t$-6&D)B!MR{lBR+aFyHfor z91O3Mvhm=2%I!T%LlwV5-K42hTTIj&-NGwr6_IUKSV%E``tUlX_k1Y4Kne~Q=(#OG z%MNYaWO&3fx}m>p$Q02Mqs%+a7L>F)!~Yfkugn>qo2z6q)FY&y%))W3ku_#Vjs4@) z@2zUeMo0sSQ}qCsg(V>?Y**GUrz^w_q9OkQqpV#f_;ut zJ>bmRU#A~GE$Q2MmM!n?Q`>g#F>Ni<`;hi2*DNZK(i7SUv?fj`Zg_w!z7#}4u5Vz^HZyYGo>>LDbijtkvHH8JI`hm37^KijR-*X$p5OR(DnHhMu9^Mg z3!4AP-4|M7RS>2aQHy%TFKpnmS@UHX&V# zj$-+x8ed+B`jGy%TZWbiZ3@Jqw*%5Vn8+`Oo(sH6r*p@ljxz1g=iOMueZiM38=`_k 
zNd2aw^i<%qq@?AVaN4!b7`B?e)U8+Xx{{XYX|BolgM|gQ3LGg9o72mb(xyr2{tPEE zU*h7#uyC@^=1oO5&!VgNccOA>-lVyLKdw4&44NDI)Ae)vsx_wU!bwmB9R z1Lg!nKZKq=^IVBm`pNIDigN9O$6|Ayk?>7}niAE;9d&JeNPA@+vM+$<4{xsEJoa4K zTCNz~td_JEom(HZdXhCT3Y zZx0NQoFcO)lwRmiQ|%G9TIoi`3l%IoRJ@eyRkmTNZ-J>+vM=y>aHwQk$_y=HGNc@I zQG?*R$sj^b30?)~(RDhw$(Sb{s>`QpmW0sYmZ)?G1h$}#F^4I$SFRDK0V*d)BNDJ5 zQ=QbXUhbydV4EFA;5-Z6603W5O*wX^XhBsAP`*b4zedCKYn7q_b#+PSjzhpp3W|F{ z9S7ER`qucRUefgSL$`x7dL8bWn?!7rUHg_ zMrCh|_4600lb*)duYCRGLiOWHGjew{|K=_;awm`Sjw((*N19>?bnkB=EuYij#x`BJ z_%I*tdogHclAyU(wuL^K2wfHCl8w4`VkpkP1^ZQv81uav@lDbd7!AeOFLDyIZIGaz zTgUmH{-4J@fb%>@qc8e&Ow5KxPfcjOIS>qZkGa4ZMD97X;H2VcI6Y8z{C#Of;fWF~ zb;i~Y&-*ws_i>tVaz@bxuifo5incNYv=$(qmtK3GJQ?QVA%o4Gv4#!$KTjl&n&+5q zcfv(E)Iy56q^E?AJp;+}P>kE7ieF#M&o12^(WKLp{&S1+6AC(V&s@mXIgg%3glMY8 z2Oi)VRTjvy-|Hd9-x(v9t)Q$q0X8CzVu|>E8h@&!rT2|SvP*UkSE0c&Ih_)xa`{fG zu4V1=Bwf0DTDhYiZVT$PvdKN9b+N=L_&na zozpH0ucidj=#(zYBcQ=7B7Y4)V|<>K`nhRttN`6@*GkSL!UUUy9ZulodyW(#Q|7wX zfx626)OVjF1H~uZ`}A+%l_%r2XMUMlK7sjugZLubq<^6JW(Ea@f#DShE{Z?xQQg&n zrtI^(8hzTK-m>h@EhcOeR(MNxVfc}f z?t-r4W8D)&x2dM0HIjZN-yU8q;~T-M+!brwuYcMIY3_%@lG9uGy0Z;yi#&ul`KnuI zf)5wA8~b11iR+#h^81`TJsU%$l1HVgP*So_`~)EZ1kpEVSQKT80AFN})A|nRmT`tv zb9h+k8ead@!+x}P{^_U6vq$R06Hu-(Z(%jjhpu6t-uUpQ$)y^xD9U3)!Nj*LfKt7h zXyetx^f@(}hT#vw%=V!cw0@Hov{+2Nc}0mu)`Id}2tR$ht0KvBgOP|20=eA*Op#t! 
zFCJmOUxT^dy=n3lpc%1roubER9RKQ$e|0;DdFPAyvDU?2+P}JPYlM>Ak4s@*(0tgE z1GPp^CQYq4H-5|H799tWG+*N#Um1f2BWg}Qy-tAzB?V0ByNk=Yqn$+>>99Jz9OTUs z$qLhUkR?1^PC3)>n{k$7V#D;xV;CdckLAlCp_ddgN6|8?_*~7V_;OOu{r(oFCn< z^o+-Uyq^;~TNFs$PI#}{82BjUfISLT+;xj`oE8$K=|8pw=AkQB%rE?4#~U}j$<{3C zc9Nl#qtZ^4JunoV>Nu$$y4{v5VS_yS+)d44MW5b)g0EPOqy(C$LYshA($9xxT?=!N z2G-#Z?mXz?><)E2UNXyQ&^gNA2R7yd%jdN|9|;Tx3yNwZ`3FHA-AR^*aHV-ACiX)O zC@D}{=Za51)ZTehf9G?<1~*gszoRtDH)&CQ5HxmEh55k-!!omVnQ3`@hCs3wI>uA> zz4S*T!II`~gcU>~WA_ukNQ0Yej)&Vb14ZoQ)fmsEGVct7EAltR_t$dZ#ZeAa^?K4z z+5`H|Rx|;P9=*BruG^e34-Ym{+o3>Ts3)b%@NxJecWltGg95E!n-p5kbDLI4le(Hoq|Nk~>bFl| ziG3$14)OfG_eX+1O%YU<^_AdgrHGU_zqwAbQYesm*_|II;7qsa-zYs%pa;W_7Z+pX zvEDA6HTWI^gvRB9k6S;$Rt;hZzs%MDc-yB%rVxc z9SW=P>SurdV747P%-{M)ikiYGO!`fVTH>oY?cNA=NXPZ*ElZIz47%7{1J`t>tcjFL zYAXtvR8Ae)3#EUZpL{Jg2Wk;`lP)KBfnTMbIosyvN%URy!*7Bp|6tPdg+w&a+?Aka zt@Op`=iDqf@%7ZYJN@{F;ohv7DhEII!<}=Vod#gk1IJ)>1Qd2spgi3HIuwii?6*Wh z)i@Chs{tZr%3c&V6c0|aMz-pjKE6BgV7~e7^O|PLV>4sv$ zBsf{7vPoq)k)X6Idl-DD6q#lxvGpS5$*9TW7q!LY8Q#gszkZgoOe$#ODs56XKzfjU zd;hScrU`5GF+Sx>-a3BO$I6oKW0kxuCXnH$XgUa9itv200m6BLyco|v{qN(f#EpyG z=j9oUS#MmL6rq5mC)ot^1k^4qQ8*6Nm-L-%BQ=S*x>qp8-54KdMfYVOPH)pZ9Euq+ z4Rh9FUIYo%Ec9og_J$z3x~N!Hf1K#~@t-mMZ@#FTIPu&C#@B^GIL(4n==tg;jFQ^vPS9c*LbsoQhK3Q&w9O3-z+|&O& zKm8=ZZUSeQ{4SmXRl5})qt<4qCpiGp{oXsagA)sjh5y#_AR=Sw&&()*BkO49J-a z+D2er2ug*#qSeoidvvJi(lxE9n^sgZE`*>di~wcyH7Y@Ef1yGs?eYU!{qR|s__Gt= zhqR@LC?5PF;G- z&{sP=!e^ctWFH+d_(ZYLR+c+rvBEsvyT8Qv&8a6hiczIV9Yh)T%PFRx{g zPl)jPzZS{))wFZBARS7VPP3o)Q80~o%7X(fc{T;Gi-V7ax=EnH<|Jo95Oa8B=lYy$d|MCxGL1kudGhy8qfWCm)Frm?_1 zK%1NP@_hKQqfu`8mp5CWwrhHx3RJdaY*U-!mX|C| z4C4+ZZ7pNir%%6La%e3kg{9otO-O$qBTY;A^z$`-k59plUZ=009;2rm1dv6N*q8gA zE+}F59K^H36oP-)CG?7YdQR0&VG2k*^#wg;MGf~UkQRe30{$&2;R~Vo^t{sNvtL^k z<|EL^1FEe5d`n{og;R_U1?W_oz!tQqow=-ARE7E39Yv?{zTNTs-4%|!h6$cMz=?6k(^+XwpW6`UWx z&qQ7`%PhKZpt5rGBuztOg6tU$ZhNIUPR+l1e&Z|42~(e)1&yq-`}1eKW5$gPL`k^F z227E#KvXcj1x2<}(ozBX9A=7`-B@#SIGV@WJf3^3s04+IQA!8CrtI}}>7l@UaD7QD 
z3|LDY^2@I$U7?u241N0BIChlyi-h>pqe1$+x}ZCe$nwrx>h52gV4qs!0`hbU29l!q z@Jtjy!Ehprtkh6~YJ%B}Yad>=Df)v2>FH|H%u3Lt|CmM88xlcOj@Tmoy@T}mm;43( z=o4m!evJ}Q$Wu{;R{OEYu_VZ1DW;GjLyS1+66q2!8$VY|>uXAZ(jOeiV6+JM#zLr! zFDUCe<(iq)T^%Ln!u`3E+$|NwNitamq#x)$@~V#Hl+7UKguaUONSko9Q}xux8|q&e za)P;(L(8#W=^F)1rK$5EnsCPIdR`xOAdV3%xzl2%uql{+v!F{QC)jQR`av!TM?gQ` zJDbkVKnEyg^-qol++yxH45J@lCBCVUl*BFJ9*JAIpzlbQ5GF&)@FnH6Ej14fRaf-A z+l@zT@v2_FpJ>>g%6LA-qdjXnl{mFngvutI@5eYrzTq`pO2Y$duSVE3M-5!akAqnZ zCuUwBDFpJtHxA@DP+8AIvVmLI)?E#mxwMkNnhI?+JyCuSE#d&`lU##qvtyqKLgb?8 zT!OqOK2P&^DLgUk%9otCMJh18@9H3@UFyAG)ym?coVi~k@onL~CK9YQSXPFU6Jpe) zom#>8et4R`E}87|$f%cg6X0`^d`jpMmsDXs)^eWNCK&8JDG=)5Dw=85BHK7DVV(>; z7?m_jP>1<|lSzCP%BHSI+Nq&F-TM< zy1v)_q~_YwKhkf0EQX>^JRW)5)?e~)MA4|>OV3I{N&OIa#2^g8dRp%Y-|z6G~%n;&tgV4X)UseBvR@qj)u( zA60q$$5M`OFW;sJ4+7MR(Mw98Aasyt&qbi|QJC@(sA-Y*1=kr9P6MJ(c98tTL4x^q z4Q0BUyosdb?hb1ZmrwjccMpi~(7r!JO5(nz4Jj)#4NXXW>+1=|yTRDC(d@(XS4qyV zQVnLsz#;eUZmy_7yWY8MDYlCfX&p9b3(h>zILyP1--p3Za_QewUSgG3@^aKPnO|YX z1ydwk3C3_$=e(X7=Q&KgN%24y|1E9Z^egjr$6zrL22i(9u~2r~&Fn z=euMnX2 zQYEy&u%epJ4H?ag_sWo!U=`&D51ir{H_ov`gxsUQ8sJT&rxcT563ZhVWdx!ObYg1J zNkaRxjudQYs^mU{n(tV;6mv!r`JCHLp&hu2CD@?6vll80CTGa5&-q9r;Lk4B84yG| zxg7RU(;iI{2&|<;#$2(Ceuz)0ms3UDjnC0PQDt{{+_LeOisn2&2zO=)2l*m*=13hl zJZRdt03G)^&$D5U_ctsELY{Z>B(j6~;B`S=XDz=6NRsyaP9@>TrFUw))p{@0q~ z;U`^Tz{-=MyewZMX^1FAs*(jMIt3|yxdwcPGI`6nI*?pgf@r_ppwhMIQ#z>STZLW9 z2oR=-P^JQT!%;e|=#km&ohqp%+A~snWO_YQ_bq`!cLOqo4M6wYBg-s5T-G7RtOF{f zsF9)c%wBy{1SeD3xehahrgr1VJXU-s+7^`f#2`FL*y6CcW@Cy1DH(}U-dq8iZ&CM$ zl{9DOGYOCf3Z&SlXGon^^q#f=tz7jOdcMo{Bu)}$y=}iXe_BOFSCJ3 z7t?JsC}A(cbg}Iw70nu3D3=A2&E^+Wfz(V&CKr$fG8W3s#&XytUY~7Pdw4FZ*;xG4 z_iSbTHq}?6tHWI`seyNHo4C#mSHDJKKV7U|@RjA#E2K*;RaXa!WPd@!dk7vr|C8^=#GBc$?f!P#<* zOqzfy$HP7R=Po#ZKj4IW`0vmEbdk91L)Ocukr8C`!&4ZZ($0FaPe1tU(^Wl@QyeX4b-oc?^( zYHX&DW9Z((!O4cBCG~Pv2)Tr`(Y{BtwVMq#Y0z@uQh+8snv0r&Gcd);m;E`)I?VX+ z;On+diA_(u9Ekg@_-)U#(Jb-qn}A08;3+0sZ~MRF&*3Xo=-d@g_dTd(7!>8bQ4SO> z9g=u%TF_W-NKKtFNqY41NBWJwPAhVl6iTTQ_#2i@G^Nk)meP|!1)9RV3dX_$kx+`b 
z5{i6!_J2NazPeB?AT?P>Yq;_?Y8WXFiH}8mH)&L2M9|&{?oA5q6v)`= zPLh0_xsaiDRKIP-**{Y$Om7q{R%Pm|bAgKny~9U`5OooWuyQ`x43cn373k-YY873i z0)ikgZNq=(FbrRz>SftC+Nuo3_GUQcv>PBj7r7^)k(TV)V+?pfy~N z3!t161A|HAPxTe<#uNEpfnMKTHD%PxDB0ugx5jdj2#TErs-@L^&{52^OV^9>E3GnG z$H79yMiuMH6MRxuHBC4VM{%aNDQ?x2Wh*5{(?pKfAij%YMT1)2`$-Y31*yyZ=JiND zimPsriZ)^)fTlN!iV^&M@|*X`hpyoQIMkROeS@!VcXR?VG5~l#|&z={_I8IG! zg?$^&@l;gvs^l{-=b5yeDQQCMF-V5guBKD@sV8~?X}_KTO)HAl((p9Q!-DbgifY#{ zV&b2eX{3rCxcKW!51eLs>HQQr)z-6`h^8yxy1&Cj&rZRp1RaX2z8c5hPsA!&lKvd! z+H8#b-#vbJ+pc|UUN11f?p%OPC;RNhZEv$7cO53)PpA5Ia`}sz55vJt zdUc=91t4|Z<`mR`CvFF6k#6M zPS;sGKXi4Nzq{#%M|$V}1Z`twl_Jj57K6g;Y4Vx;4ht=OfA;rRVu1nJfW( z-G+khQFQLea!N~1(e?$oq|f(#%W@uZNk!VZc1Jkf*tS&JgGx@(Q;@}avfbHtIStvu z{QUrPm-19FpT~XA@Qqu>uHqN$PXjcmy=4rD^hSBZb9%VTRTuPkm%BFU2Q;%BI?XPU znr;kErw{L5?7oOv^>U)Fh&JiwPlw#873Y$I;x~O0%I5sj>MK;84M?FF%D(%tK!o5U z5gj3*^7)w^+;Rmfef$&Ym-&$@%rAqS*@5tg^g}S9+w#+)J1=-4xhD(0$|CJU5X&Cs)8nnVTG}t02!9mvOe-zU&E}`^7X&Yg5cF zTqrK6z}OcQwe?q!#=RA#8u)Mx>2!_CA&_feAuJihb~QzJj4&$?eja~w>{GmuW7rnC z8)Bi%vXoHuu@LcMn$0HUE{vw*p4GZlcYyM9Mww!a;lJPzN>wB=k?P@g<#EL?IHxC( ze)U;@6VG352AX+H8`pI%aH?l{L;0Zo|-=WlZ_t))9K}8 zI6vjHHIyfAh4UVC+Z<8XOv`dlEk!nP`GuyrUZb$I0hbFAzk%9S4P%|2FkFtpb}+6% z7~%S{AcVAC!kD&JI>p##099_nZQwmmA<*RESd=fO9XA)HP@TZ+6F-=&CWS1E^8M-} zQS|>r!M=M_tKNHji+=fCyfIWjPTc~1#}u(b@c#g)%OiR%Kz&tCbG$V|WHYbO^cW>w z&Ib69>=i~fCAuoe&pQU#w%H&Prj9!8Kc)y~`<}1oBv0Q4G}0>;_j|ebz2s5QM@I*v zS!_K8EAG;Ik{VnEW%WvgGxFunPJNQBx}wrk>;fzDSlm48ho9wP&QX5;7e4$HE~MiF ztJ##)IQG$b)D3MtxFc2}e!w(ikt3bd7P#ofJyv1fo$%`JNoPw5d4N>;>0W9;DrY~W z1nn2wlm07eueqCp!UP#2OToXpU2}cs1W9kS0KNX*T@&Y7lD?Lf{owd9;%o?#Q8is1 zWUVH#r@PVIdk{(tMxZF+*>`vRyHg4q+JL_9=G!OBshZStb^h>VmZH=FP|CGsBT%%K zpAM`icGE)Lwm~(tcAT}D0kscQ%G_)I`lr9w&dDS*a>15j@^M>Fl4m3{-;>#39p|3o zU}O_IYWV{Ust#9oE3dk!v<09ChdV$OcRx#NeQ#$fYz=s~s@Lp2q;5ytM70&Oa)Vs= zGW|Q)ttyH~e0y)hK9`u6;LHA)t4#7F|l9DbRj_xIi0FeJ^9IJ^KzMMT-c< zi6fx-opOypzeldZu%ifhOT~(`?x6nVTafOzn!@x?O;+uf$E_EI*N|1pjN(hGI_p@413_I4=kLmj 
z_CLSrf1ZDC%Kl}+${mkj2$xMoB{y%dT+Q95{h42$2b>?}m*2h6brF$7eOdq%;mGz@ zfC6C@tZ)LqC>c<#{UohX-J?|S59@f4Tp##wG}Y1^FQLhp;$(s2VM9$(tGsR?rSApeM)$LyzP^qM6sqjy8bCh?#aDTSY9R(MYOBl7R8Io$&auWgRgA+K8=qSSZG%LioasK zA!LBAB&h*G>=y}f0kmkN($jzUK*hx*Wv5bN8w(H`p7*l1XryzXyptMaH>r?UaDJSq z5)Z?hdBP*GiBU`=O%ony=>iC_$T9rw=cGbXfJ|>d3)GMTyZl4GNjJ4Q{(TzXex|VR zU;Xy)zi;;@Fg-m~1_sFPXi4eLmrXb;C9-r^`FjdWX_La6Gg#`{tSic09kf}h_cfr| z1?}HStz^h~QaqLRZ#YgQ4FEf&!3VNuXDiA*2L7o22P~8!J@r506Ry*6c>;G* z0CU-D3JyMOH5Fe?lcn9S;M~!D8e@oNtNm6l-;|Zc6zy9FPKsdt8;(|Mlb`!9z#XfO z9(AKB0U{EPlUjKcU{FoP8~Rmi@#a{C{IPhm_us$PZ6xc)WeoLII=Xd+b^hNDS9|}H z-D-MkGHZ)BJ7kd)yG1Xj1!pv#s9CdQM%=_lS99Ve)eJr*Z=n#(DS<+b1cjS6pmBdv z>1YzjwN6^3ph#qOOsji-k84)u%xkJzppy6?v`7M}b`L4Tv}17V_(j0KuIo91KqE=p zN8mkY8o5?-x~5k107D&VDU;&<1YL6qHPb6@K`R{49cGJa9_}5}wXNyY-B>iJDl=|zX#yZNKn|zg;oar@FPs^R?TwmAO;TVT#EF$m6Hr1&q=iZ-0SLc^Hpaz8C2<@^-|z;o8Bvp zt5spXcV4GlAUcS0`!<|?tIjYy?s}g>c=mi2IE~AXQVY)L@O<{GP5Ga3l~80^4ux3= zH;bbcB}>z^ML%h%>irz)lS+5~(>UvVxKo7mMA^g2^7h5jUc0FWoAnJwpX?;Vp zoy^qv5UQKBT`DMv^Zn87>t?u(1&$l;7`8x-1U`4^jso%0u-r*66xJUpZ|b`SxOGA7 zd0Ek)qzSvw7CS>hLs13H8|HXrxwnY#1C{C;R<$b^aVIa?E@y-F!7PRjMiT_QfVK_Du1I(>c38c8KiOQV$+*0Vl(u)Lza*-ywA@{xj=3tLdNCU`#W zTWEM*I!!wTn`JiVvZU9fEDlpPt%z{Lkn-L<3=qUb1-}Ed2&OkUrEOkPB%uMCeuhP* z+VPwj>7AlAJ%veR@MjWkhR$r(zg7XvYLKhlQ7YO>yR_sO=~I~E6|0%(u;mdbvbw7v z{eXrmIiBcdb<8FKI-Do8=0Lc$miK3Lkv^^J6g8mdpWffK+*TBvz*xYq!{630b1qz? 
zRQ0Z6uJx*oSc&~9SdHz4j_vR&vfu~u1;N8MoB3r=yvtT@(}&ufia z%U@AANKwzBVV@qH{^z-EGCah3^|AIWEDz@Y5K;u~xK5%4$!xv8px=ZflLPkTZ^|ZDk5Zzvd%_hTf?BL4Kzic}?&Og`%5IaTI3?TZ75y`OEwK$e;`X{V^H_Vvl}8>gU>yfsiK%?JZE9v(EWwCe<-F%+h1ipyGqFe5>My zroMxA7M7fRP1$qmNwb+6ITf5IYZ|8r`a#i8x3%Lf5ibMD{qX3Siy<4xt>gwt4d{FK z4GyPau`wKXEblIt^=L^Q`GY$n(^i!w(xq{)gUKC|7NqeYaQ;GJ4~eM6geuH80}SH% zo$t2a-ZbH?yC1~cZhR=|sxXA&H1UKv%ORsoUV%v=pAO52l8<{(HzA*F9sR^`k`|kY z4^p#w`SN{khu2V|iT7hT5*I#_#MNzFF-s%AUJcTeFwJt5MD~Vnff>cj(wH{t^Mf$* zWf8Iorh@u86rp@y09`J89pqFZZvg74S8v9V%Zr~iMfB@r%D@hS)u&gc0Wl>E}GroOSLrFt+#!m{4eoxHtxU_Sw z17-#=%7Ou_j4h|{`EqI^>(oJh=0b$0LWGE`Cqs{PZxm1c^S0k$Rbw~`Bh&QlajLlY zNr^N?d{Dt|K^mi7G;rEgNWQ|y|aC#I}sb3!_`8ja1Z#u`fclg}nK ziXql5%i6ER(Cv49(dM*?o$vaHVb=}h)(D9qz{F(Y*UQs)c9I{6ts`&PX zNu>7uDofFh^yi)fr+dusTg^n2!UQ-RTI|7+nygeQE&>gIY#x-%B0bd5gG+aR&vZVx zqNX9IhD^Dyo&9cp<)63Ajraw{oLquJ=N=B>}Mms_RG}m5O(6ds{X7|i!Po8&4;ZghbiRr;5Wg{V4jAbjT=<7G#WFT5Q1Zh z6?463J!Y@6C;SZZ_aVr1j?iC@{rLPWCc*CK81pZMJGZnvnXpMG$&mpgN)Zwbe2$fe zP^9gXj(l}Ja}9zDk4LidceLj6Xq}~U2?-Sx<&pv_$>OE$&;pf$yJ+;VHwgIezxe6S zTaPeG=(JCvjmK2R(J5J~poG4Jf>yUoM1Bt;xMX!_lP=53Bn1Bo#<8N{y4$hV*6prXTmE$;Q=pY(f{eW%O^* z7E>(F51grG1Ic97VXm*zL(mlMI8pPjQgw1m+#zW8>5?ig(V+O3_j%~I4)e3anY0J` z#QtW`oV$0HWFBD2q!quKIHSyF1B3>kb)wdtneRCm#V9?z<9a@aQ>o93RNv9*xlS(0e`Tq5u_&0+b>0Q*}Wbs^jr`3c9q#QpAk#J{VRc1x{DoPwA;| z0jk{Lym#C!K*`4U)2EoEC+?dn29yI2^RiN}?CL;2J3V81*-oX$TQ}(x=AqgghJ<9$ zyjl`ioafUrPw!_1WMR0vJ2Dt|7_W^@F-K1DQo9rwYIf{Bk4Ws)w67w4*rGp}B8A=i z^47I!;aPGZa4F6Yj)x?E8{&Zpa|3Ao(EH{B6b7lmC6>M4uIN;Of*8%@Szr71M9d%e z&6l^Q|Jw1N#Bc>UuNtJy%o*Hq4P+yhV6O_do8R~N2sM6%U}8yK*pVs;nx89D`Ed9t z%&#pgDrG;zTa0V5%;dtZxZS^<6*OEr`RI>H&pOQ;q?2J}3)14q$3GMBO+9gFXn5(! 
z-Eg_7@UpJwKfu49BqpmHbt#L-Tp=F{_oMX6zN+gVf9f;~l|DRb5q!L{Kx#xkJJ`3J zsbO$iXYj914bJv)VUXxT==Dl&6Kz7Kfw-rd_~3|LVElzUa8Ae z4k*PIi);Cc8pbXpA#7B1t3W3wPHnZodg}A*>Q==RWita;2(1Uh%KMHhVtXipU}(4w zxI3{2pL|m;)5#|fUSY}N1%gy7O3sS#VKQf{uIhhLY*-8mBkiyhbAca!#~&*}?UJ_d z-&WX%qrJa6@3_3LD4$q3W@)}_8Dyj9yR{WF01a8+VD0akKPBV~+Xk>?(CgEDnTse( z5bdl1aYqUaOLZdVUk9JZU#QOJZfyvVVilxb6%@$5U|W|X4T1)Xnvnh%gxu$CKn_Zn zGI(6lY3I7h1+Q%rv^4s#P28QL8mzO@XK?f`XjThwTqW6P($oxH%S}swULUgRFKt(% zpJf*ZEnK$hT#lIF;foB@U_{G}@cR0lNpi%Og~#o`GH`%R02wF)PgxzYHzB2u6QTJDn(RG72j5kpL_3&u-p&926w`VwRqUgqB1@1m( z-XZGG7L;GEBsosHuQiN&eiISnab)78?|FsC<1M{)1tbZyv(K! zf=0~@h*Lbz%2@Nk3NS~IrUfc#7k5?fCQ{ye9qx3}iPxQiSf@PZH~I4NSHMnA$Sv0Z zgzDn+q;<3})^g7kOG2(eLAQZvRhKw*KPyPpai(1L*Glh-k4&%Y1e&=QfaWizwu8|} zE2`#qwE-zci(@d8Npp56{Njq{0*1wisxX*4lgHm)&O(q4-yVF~IJMGW$Dej>wZxlO zYHkKV01{1A{KS353#dN$X~4V*>A26F`BGVc2F&4p?s2I)mK9FQTz1a2$fmM>`qS;N z=v@l<=k!vUF!eHigAa8qD|ZIYt$K4vxfR zTv22{kN#$gf;iSym3@zHkYu*R1<-&AK1I@8Rh-N$d2^lS;m8M0e{{jDZI*) z5syHTs|c@~fG#G2^o#hW-(sBp#J&mVULP(f(b!Ix>BiS7t?G+I&uJ8-C6sMkDloIa zp-<~(i?Yhs=8TEtMMCP3L142SfVoKi8{03kzwZF}o~KXtVGhs3-_K9u^WW)T@`?0R z<@onN@4wq2&B^Jz>kLzgfXT|3oq`!*= zrI5r9c6=IG=l}JzPFZTwrv=&3C(fgr9!&Wj?C$l;r>(MBrvwP|kVI{5K_{tD7jv)D zITzhO>8B{toBAzfJ@KdeXTRVx;nW8*|6qARD%3;bkh@a2%mJ8QCDMybD_Ys0@9#D1 zzIacZCnx)w=JynQa~OFJ-2fKnQKO2I{d1h@8hsj{evbZis`h!<&sSs+Y+XQXwv$a@ z)8EQ-pu8&K{FFq0)Np=4Y5&bA^7M&s1B*xFK|pGnxQI5vOJOs#j`)XXdN%>$)bN(9 zVg%LaY{Ib5k^`58cQp!Up47Ib{T|?a=S%u+{6|k_T|+UneK@@5TJ1dyc+%@)*z2C@ksnR6qXn zo0M<%(1vPfi^^RZ>Vy1&M;ej>W>3zyY@70ui7+|5ui~tWeLsvrw@nv&6ip$YAIQ{e zSy0ViCNCFn_X^C+!kZ7hV>(YPA zXJR1%3sU}$MQ^)ShCMLat7OiG|6b4h^-#^r$Yhx%o|hu3)+%Eb5`5aU1T=jVT- zd;LE5(9() zXa!TBo^w$OoIcDqQyu3pz9gLe85!k{kc0$Mh%@0`vi4v>t(iw+>OkG7(_a@9dD^6G z@g`}QqQQHH=gDS!*OPu!1?4*mIi^^~`ZHe_TSgfVjXWU-3nTHKl^u*IpH5u$b(D98 zb@dfn4-7z~^3<0e%>`VlJ5iX|fOo3fgytEuR!K`uhEqXROu0IMs#PC*t(*U}Toi1d9k(l}2L|N1(!XL^?) 
zu#4nP3OgeQqSWiHITV@A9IQMAjf+yQHHZDM9nV~3||HhR=4f>HEKGyWJ{LJ z<6SdMMY0@W8ZLC4g_btL2BuIAILU7a%2vU`UyvVnR=5w)UXz_-URHW&epvHM>D-sXlF)Z02m8HnK(L14Nt~?n`zy?q^94DD1 zsLo8QN+~D}me!#HtBZa=SB@Lqi#c%KFOc?oa$NAk4bY7AR+Vu5qIg52n@|(^tgozn z`uRge8?BAuALFDAGR5~UMEyO)V514o`-(SeTG^edbJpC2W?V}0(7R*R660xIr_|o& zFsfx)<1+MKQ@6aZs+00Tx~qbg1G7_dCZv#$w;18tT!*>qgZ=N@znIp{6PxSM=7LkV z;k0CgtuemqSTQ%S|DPu#dSD1kfpcE-qD+$H9a8zxw|ZD#MjTl|IAy8-iV@5$bdy#O(#J z|DN`bITZN|V>XUIRn~VY2@?)r^&0j=nyU)-cW0uh$=Gku>Za?83@l_yqo8#tiU<{e zJqH{h;pE%!GKyizGD2T>i19gHgd4oR?SRP?09q0b*4!7}r7RrGbPKrItqvpnBrH{h zn|8ym17i2_#bq1U7retkTe7ZW7easxH-j7WRG4f*d&J?Hp4rE}yWiFAccWHn=clFi z1zO08GPDq%3i1wGb{|Z@?G~+Z^}P`CK#Cr0N?SxxLwDr6k$@(&6dPF5MM?Kq9WA{L z@w|Kbc6lqzsuN@ghxr*%!cFVT;2J-vj@00OCb$k3W^>Z2cm*h@VEx0jyQNH+!_6Sk z^|3DPVav8ImRE7SAWopP+OoQpI~V1R?dq!PW}tc{+*i{7HkFJ2#b&_f09uzfgP4;p z4)=W*u6=WE73+Lz*$k#k6#QQ3<+G+VnDaJm2|oA5&N_yA^wzGQFAOq{UT$>+|i98!vUR?^PjyaAR?^wr8GGP#P>{RyRtQ3(!iS4S8{ATXjO4 zTdYL1dO-W$(KmOmSkKqjEg9rAYS`z5@GKIRJ!v;U8bEO;aF;F^Os(|^vJ|wy#?g3K zZFRx8K$9qN`Mjvp??cvi4$Au{w|y;KJ?cdsDFdwK(wXDDT1O(p2+*ZoW3pMC?hGmG zzq&I`+Urm8toI2oM92Uc*4%O)YwH+ZH667UxPCS{J{-TkwzfC@4z`%39SnZ zZE%tviVJJ|2vg-c_|7#JTHaxed!bi@Pf-KB64*oRp_sp-H6%(c^mVvzus}X)b4bB!6#JAF;6Nd>e8i0j245H8WDkCSw(|P`;VIyyChhcj8=2 zMNp_PpiVFQeWX_S>0q@W9mb2vZ*y`6Ngx!K(>>rBaR5-w^7S=!XPS`inDxfR?Io}o z;cN9T%!$jCMyj+83Nt-d+IYOAqN%P#2D9)X0Q-)W6tnni9$v9o z15mag%_3{haFr-Cv~G_;IbO()B#Bb_~X5#HE3F_Qh3Qz}=q7^EA z;y8^jBHk8xSilZ8g}w8`^IWX&vL|l!P<7AM8(`SiT{t2P zE|Ye`fPOnTu4ll-N`nbmXRmtji~J?F*>_$MF*C%xc7`t}@6_q3Q&$JtLV2+FWb~2A z+JJynVIt@O>ogIF0d^&tuzr3%41~_YA=522{hR{$;j%iwly50il+*G`%knM04J`bK zWN+H`!O-KZ>*?;V^IqzUJL~U&oMSM;dWbV*g@`7cngrB;#pTZLzdRTEzkz6j>GQ)J zvmg#~&05>f-rWtm)r(Um-mSUi|D}O2nIb@t?+wqh!EIk-KbODz@;tpCATmE#(Dal~MNbP#&L9wC&{!; zsbU`iEK;XZ$|uuj=Z|~XKJM9FxPtO` zmr0BkmLB1I?xFo0vnlTML+l^+nuIol{Rbb&ZEpGQzAs)Rk;}O?kaUiGYelc!u6cGr zMmXOasM3oq(y9DJd56qM-_Ma|(!F{ah8H)+=ON^>LyAIw4k;#kiNjPcr|RjsdxG4( zqjG?kHi1CBglQ}KUhvfovx8$5GBW#b*M!tWZ60ju0XrHH@pSdGYIy+_#|i<|={ESt 
zNM+szQch11ZPWBS`scS7T>txHwI`EnI=jrmF9nj?uoP!Nc>!wHD*dSLfv&Dkem%F( z>5@-}OQoopzxKY@+Y42YH)3t8cF~Q)^B){u;{8Yu@+As;YGno$Nb`dBZ9oyxTIA)>odbD1V~TmLP&c%h^wdoAvYJpSY@WUPp~5Eev?gY&{K6KXp!+ znqZ!aJKcfF2X3%Q;W`O&0vo9JTujxHsgueyigUQYJ2UR=kTrEh1#ioN%C0FAOcU(1bG`eG9>)^zhpwm~@fKufQDM-Xh3L&fk!E$@Os* za?*`2vWor+lqdFJxre^1+Ajc&xV(lt6KvpJ z!uj&#{-6r3Iu!wpNyGH|SHbF=M)Sf!Z)gDb3_mVBK z-|;1&S~HkwA;o^iuoVq_MP;txHbCvBURWQ-tmql%B5u6@h)&c9c>5R)A(i6P^ZFB{ zJ%eom`rkNA5K@2!v86d+Y1VAibM;x#_aK|NuI1BDc*27V0im>3Z=zNYRG#bV)CFLuTJjmu;9;}Wu`@2AA z+|uRbr{m*yd!v^xsfnIU7I^|wOm2T|)0r~!nYyvKV>|_Fo8Kpc@9zes`Ga3<0@5E> zcX6`PqrK&rL}rtQEW6XumsYYPr5`SR*EQ^bZc0JM!Sg=D7X#?rUcS%vLYzVke1CO8tPf0*Mg`ZpfNHx4pcU7DPVp6A}Miuz+w-4Vw zbPZ>}dzpEUi`1yyj{gzx`+ z#Vcbb6@jRS7>b)s0%)+yWiL=IF-=H!;(N!I)DK5oox17+x}rLzPfR1xPpa6N0!=b` zLUdJ>>1z51ikRb5>-LnpSE$S=?Yuo@3R?&7q1wwAQD16?M9p7yNZ{HVsw_S%yq3ms%*NCjK>iY=*ONvKv?E~^3PCraZ_d{FX>LN8{{ zmeZRQQr2Q1o4f#TK#;#&^IWYNaWmaZb&#LkQA{^!dhW2^PE~#s-=ZM8yDZhces^0O zB{4<#_2t~X--qH?S8-0|W{arFK+w??JCsQ%Oox6NLai)mJopjPUp6}Y*c_&CayLm+ z7-IHe@KUGjf#r7T3UF#U_9h?ukGWTW2=sS4_TZ5W3o)m0l+a z(ikrvNEfE7btVr@*Ozo|ie{dxZMyIK)3dZp!sS^9Ou`i-Bp%wm%bI3&Y2Q_GP7?$n zOZ~J3_u}mR6kHS+D#He%S!q+SzK9ZC`t5Y~A^jD|XRrb^J@sMpIK9lrK4ivw2wTIZ zeXP??4Qs-bH9Q?$ycCb`l}yR<5{>#P5ve1M)0_~oHvGhT?#Gb`kfU}F@2K#nhkqI+ z%RNS|;o!*^l$i^YGgfe29prmck?eBR44j%q$PB7GPB5HV-)Xw^14;BZz&r{|&Zy3( zI29juvtYTx5s~*oQ|y`k6P9SyP#SU_>7KjjhOE=KJfVt$5^pRf&e?jhM>PFE7_vR7 zgIqZq@sUk7#OY?eY<)HlIH5-7sl&`eQf13#XnydkPzkAAzWPcX9-m%_5ama80jOcO zlfVep;Cn%C^6VyM%FQ+lj-s4XUDWhEx#(S7T|3FPb6m=fJGIC9s%Co*O`M(d0u3+^ zTCr+gCG~ppn5^LZjdbMaCLkF?#a&i6do#%Z4Z?sFW_d`t+^i+pB$Aw{;r#4OW)KZ0 zhySYjAaC9Z;_6|DK!BY$fmQTCC9prbzQb%&4d#6CMSkl8{Zof925W^)_gq+TqKsWT z_zQK|3}kkg{=w9cJ(q8_zRzq|8XNxqYkRXL$#E=M@Yy~iMLI0Q-FzW-fMAr2i;rA? 
zntJm0{{O!i=mD1l8cBfWaU-*0Yw2z+5X)Kgs2*ZO_vSYI>k!-{(swsqr9hO@Zg#io zta_7xQmaYui&V2Qj1-hQdO>T8iBjL^=jTD0Y;ou#$H~Y1`84rKH8w!dY<2nEfVG)m zQd?T0Jm2DD!k$Os-IHstBIiqi(jGtm1=exKp<#6|gPq7(<-qVyJ-SF`9*zy57&^bR zs`&!2ZAce_UjMh`M~PPoMPJjabfuHFlxCfhcq&L4?*Ky= zYFhpPYu5#3O8tyjvb~Tsoc~K8|C>Sn_rDv5>eBy1ZJd8_ZcP8rKmD6?eXo@F-YP1_ zu4N8PIV*Z39Dq*`oq5g96W8+{zdZQ|){g&W4DR>&#OL)#C(Qt`-;_^RKmnBVpn}ab zgUS{4Ny-s_z=O&f^YPDn`qQL}^Jmz0Ix~%fc#ayl2Du4$mjG1cC1d2LY!=jq4o`RVz0`ZfLM_wVPY-z_o5}|A@f1jw-A>x z(b(YA-oV&fCUH|V9k8PO;UA&qBqgXUlHPQnVr8=VbT@a*`6X#!1({-L%_BgF`s?|s zf|6g#U>bmKDMrjVEzHF*__(^NM5{3$nz^FqHeCdC=|E|bTaO&*=LS#`V95iR{Y{Wq zfX^I>_y)TkZ+^$$E01uw!Wb z_4)ZlxIgjRvJ3GmMI@M7aBleRUV8Przsml1H$n_@=l^{Q=a&Efv#`bpq2+7R5fOff z6Ne;oheaO$j`MwEbxVS@nC;;tcTlNd8It80QkXQ)YZ#HnjnSGYl>&b<`bXNo9>b&plU$38%yzs;MUfm)d)BpL0gDQri!E@bZ!~4`X*H*EXay%! z0k#Ij?%p6co1Z9*7DHkNkolDq!O#DCU&mbrc`x3^Vy_42vP=D7Jv- zn^l?7#j?7xceM7Y=C~Lw$88&lnu{g+OH>wBo^!GQ)XhNCV~P3I7NEYIL~)J#{Z>EL$awJH|WEp1TKT_(TkR-A4{Zvyct4`IoZJ9G**C31`a7Jjiq zIR`t)Ec^BaMOrw4yCN6rRuIjca<@Ve%qGE?e2;9oqmv;rXS&y9I`q8hO z#NDw;S{%SF>HR>R(DmN15);SiFvv)b>Mc0A&p{l!iHDEAZrhZ5a-4e9-YU%`Jg(+4q}e`*TBq zv)85|Yd~oI@g&0)oiMdTQHLnMY7M&(Hz-fqw%qsh@hy_zbfg5&*d33fg3tY>2CSEt z1mz`trxY#Q@iG;hhaK-GLfu2g+6*dxL?t$=A4%htRYj>o{uw8%$TK4UlWipEH>YQ= zrXmyBu%%#%m+msq9H7vN$~DF+&`-Kh5t{b?YK$3PVtb9LDN2PhVY>L?2X1I|VA-_3 zxL1uBCEu!u4`lSnnsIoK@7oVRwcH@dKFX(chFI~k5xiK=m&^Jp;?lIX_U3LWFP^8a zZ|NVz(_s==3U^i9&^+mYWhJK$$8x`nb=nIq*Be`~Qj08IFt?}YLC3Xv^>GZ?HE&-C zM1ZG_jFc^~WJOZC6yR;!hB;+e*4}JdQ+uwiW7?)3Tn~$pE+vIVGxmX(eEUKMGQABg zTjC%O$m9bTPhAe<+mP;C1TpC%(=2QCiWka!&RSd675}U8p17wL(D8w=fSAReq?SPE zDCDa>mn%#w>WH@60<~%$`=Dlo$Rtw<&n|5kOMit_5ZCbMNFU;r=R6Pw$0@-E*;Gje zG(EPm1PU|3`aQ69kb2^xNQ*}|#<#Q0JP@`W4U&Mx1-)-}{);lc8{Ij#Kz_4>Kk2Nm zp}sdP=+ty>^|YT)RA8lN_}foyvcoHiBrW0^XRHR=wpD{gBA;|JJ#~CeBfU~AY1N~Z zK?BsIi!`@Q;@LjVY|`i;RtWP;w?ewQO}jRvFHp28bl_bWUQ7rtLwcnX+LZyCm2F9u zC)QCqS+d*3>BLwHI<=yK241OCVu{(KdWgc>2N?$%ae$BtmLZO~9C@2`&?wbIq}Mv9fh#5DBGk{A_@YDB+oT=O9rm2mV-}>57ZZHcbM50j}i2lQ^@;>c#TGR 
z%EMJD-I#FtVLOqfbs!s(ib(_H6lf{u?93tQ`;t?l9w3xVXNKloJYwdKBUK!!9s|+( z+FsHV{L;^~GO=1@d0NC|E>0@nD~0L7`3ef1=Fzeni5e~6-$Rr%9@5`VYeZz`BPN== zwEVgcLhuFEd_gB(aUJVLx+nqFaJR1`Ho0kMlY7~R6pactHyXGSm?Lr86fd=`sqtof zBLAMW_hjt>Gc~2i_DeFTjQ!?F2T~gFB`Qe2KEJ0ZUnrZA`Y1ikK>G7M(h#ViCsP98$+UDn@?3GnWgR2ZUpPkj zNlNdqE8Vj&{*mI6QN6|+C6we#YFPM}PE-2&x&`Wgzjh}HBa8q1@Q=`Y;h#8E-q9dF zBd^E7Vhz7?b&VKx^jlW9Uy0HGJXV=J=mf<61%wuPY@IGPJ5OkebkK0LYB*W*%(2pr zUH+=?u0wZNFg05}^nfcsf#ieBfX(KH^nXP#oR%sj+b};rzrR1Tk_ZhuQ#&WpFJUnG z0Z1?Q?KXtxUphXU75R0Y$5an$lcmVdq-9`-R;XWD)wM|A|!o(tPlb zm_B#PTcY~)Q$!_+<^u;RleQ_lqNH3D1RN$aF_%+?NmPBmcB1Yd=#a`gDcA(kr+D9Y zX`Awwv=oa9l61A}KtCI8Gb3STO-m{j zjPi;lHFvi@g47Dms(|5!167NFVL)0cI|)0 z3kUvbx+zMajMN)krb;yctvHU}fBH5VJD6#tj~>8_@r=g}DAH^0 z;#YvurQhFIB7e>|K+741Sx#eZ`8ygGaX@ECe672*;8Y2o8RrdJVp88si%1VBJU;@j z^Qvw|>x9c+6(wYi!Vnd{zT2(wJv1PLD?c51>!53m~*3~dJ9rTtb%ziW~({& z5StSM;FutXTvWO&7p75AqPX+rXh>B@gqavfT@^Oq{juZ@s2mLq)eWHoZ+7YO2=+jOS5CTQRSOeuIW zIeMyr{A`i5dhF0k+@jjzt5b%@pz`*EfEFyN8`O|~U-mW}539<=Bx(2t7_ zI>e$0&5r-`qg)6_Qeh?=Q8~rW6MIC{f(w#;Ke3{o=$Z3jGB`;7&EIuOtZ%c>K!A^T z9y&4n)9dbTm%=x#ww_;?ymVo+2w^A$h-aPX<^T#jXc7(aCDNq^*}?lKC_?%2W%BxFCBOK$f zP9|rjM?b_qj(-jTY@1EDv}36N2$@mGMDP-nF48-RUWva{5PsLK5)<6sJz_zoU{8Vj z)M9p*$4?3A{Dx9Gf<&|hX9A%pLOnkL~+As;+MHdqBFJ?p>^`)cb3yNEx(gP5V26Wgh37 zNBX4W(5^`VUwWf>iM&t-6H;EXC-Nc4k8zm5kw-U*qnO>Lf6`^|5TLyOitq8Pmm|}M zF~m7=i@nnXH1|jR+#jhOCPJ!5@$dNe#^B~>G$BQ2mzlf-Q@)~E4Y&=d8t#X*2>K20 zlzKm>3pd{Q3&W_!!)tUV5V399_H7@^tZl?&(GJ} z&~;Z5vNVIFiJZZ!EoE(0GGey$>n62wn;t0ia*6`EQN!aJvkM3G{d-0_Q>Ib@opF-!_u+G54HyDC}O!HMJ}#B_SaMrB$G5O@Bc^=mGQQ z+Tf_fC?*n;A}BRQP%1&+279Xp^Q&$~q#lw#AmuVe7k=qHIUyZal*Zq63(`wJiqs$U zn@^eLP5ma_le${MY57}U2BcWf&;;|m{-bQRZid#8LrGUL`*uKG2%9i|Knbb^>NOn~ zKh_~4P2@*Po~cKky#P*~rm?68<$u&2)0QqiJRflJHtU?#H|WPDo?cPiOxExE0HJbQ zyrD(tjPXV-t3W^2?2?$@O-naxZ(sYHxZO<^)J2tNT^`w`g9lM2%Qk6&R4Xb$v@Gdy zE8Jiz>+B|-EI%}gde)|})bFn8EK>R-q&pwqKjMfyA12Gf5});Mx=RHl*%VBj8~m!9 z`gi8M!}J1EbC_;wGuBW}V{PL!$@|}#D#)3AHvI6AC}$>QE2Lr$@~cRR>h3{uPnnqf 
z^CJD9KR*3O`m9=;Uj0*LvN^M-3tuQ3QgG50K(*ujJ1GML`hC2ld$YHj50fi3uGIZl zdlm|?)W1`jEFCSoOZlR@VDqF-^XHGli7vBiBWvpv;&&;AP1}{bJ_N2A>4F~P=PJ<&PpQz*%XEyP$UC19dso$E-08tCJ1S&;Q#kiZbyr_ezTS*N4%3>nI-_fu+ z$eTjTbVgy0xhyA(@?ezt3W8<}%;QnFhdJbH_tqVtCbJ$}fc~lfc*r~@kYg5LNr@ER2^t+^AD)kmc|EGywcvCo9X_;cj7J<<l1H(q2V9g1z}^gS&Y_ozDqL!->M3 zn(m`{RCsZA*?sdv+oy-X5-}fD$Jm6ftmNC=YOEjphV?XO+c^2iQyWJM(+BQ95?qmr znp}Epx}aj@m+98@(D{}FqCE#rHqubIrLYw(>f_Kb?@)o_I6pRUeq2g$@7y;LwJ81G z-;Q4?y*GlL_8vJ+8_y_L*PcA^zh>7EdpqS6{xZSORU}bdW&K z4=iiq+OSkD$*+D5=Enokb7&6pnRWLzP&tPAEkFZBOI6CVH(|K`a%$VJV&rXvJ*2)5kTHM+ zq+>xo2kM2+^G!`lx?ReRKM~oV`0g+qvneUzG(BH%UX7v4wBdTXYzc4lGwV^kUvK_= z^LBhlolW6^>d^;XR#c|p&j~^-Mcohs?V}Kl+Q#NSP{#hv1H-HRVXZfb`>Y`it*Ry6uTt z()GO`UTr^gR_fR1;rbfX>x;krzRu*_$raU>avN5!Xk~Nbo|<>IH+(ugA@gMiN5TQ5 zwxCc2ar5W^tY1X>_nDQ=#a$h$emRZ9O`vC*bd-!y1<m);&P z=8wG*=zFI~&%h;MNkTmnF7er(z+dVhF)?&2&bk4+tY2OIr}RL^0y*aHa65I`^I?s2 zsgs~a)`0YaVTL^q?DOkcs#l;ff>_jtU8JM7S_{%p0tN9QKll&6pK~CJxhB&=4~!FD z6X@Ar(kne$Q`A5GS2|1fzT9~D?OUepM~xM7C|QwU!d>hnQtZ@#bWhtkj!9D=)MGJo z+M-^OzM@ntXbuJ0*|^-uslmJtN8{c4o}U%WV&qwA!5Z`$-5yYB*#=EJJ zuF*mTEj&Ycoitn-AVE!kp7tqmHry7}uN739Wp<-6O*f=8&fdsP<&7t!t5n2ojSt+{ zaxotcZD)-fpRH;=>CvM<>FO%H`RS?B#oSOR#)L^SUABPD{9LB1n|^Q7KV+zB!Owkn+YGn{ z4T}I}ewjyUYIg9^!WKfuz<8(=p zzCTCmSA}v^P3k}^cWO^b>1kUk%11ShlU2k0;%oUmpa@)ZyYXiWy17Sfn_Q`)M9;xKOR-=-0kzXij)$`gO=0c;{o+8w5a5As zM{AGi)(>F^sPJXCakE9@(t<)yS)`E9^Uin(cV8Z(?6vj$IMjd5*Ht*}?yPe^JYAzS z%5Dvt!{b2x>_5;1w8!quv$swD1FCJPRZ&jj32fK_s>TGW1*nZ-a!D>k>REN~!FSZ4 zFY3|E@v3kW5%E^2yH}J0 zYVw|Xq-K^f6Gb9$60;d@3+vd;2{vo#*0Fx=iWG-jj*jOMf9-m3kD zg>DS>$4@Nm9*57LxAO@YBZ2%v+O>uSCkdwFYf5jFxYSUlzdn*x%g+pskM+MhpMNnUu=bz%5kM>*+#xlh+h&(aP~yCY?NH$N6sMX< zs+~b$tF}o$FKF0x$G!yHBg554N@R8fq&2xqT8Wh!dQjPF!5ZDp+msVNcQ=jMW~D)M zqXF`2f-2_4s(Lf8s>?RyEl8bL&Za#XsIEhf)HdccAF=-X-9;rr+tE*VD!Mul|JAOz z{>{30it*+uQX*`>%z2Z=Pfy2vh=vpzn%{<|@`8~ht;i0PTbHHq^4DP22Os+TmtQs; z&l#UizXJ8d@XyZ(5?FePmZ5MS0>(|d|)?1qhk@%M5eD4zeHPn8x=Wb+T9yIxCCY}KM? 
zPF@Q7a%x?hXG~g$a|QBLgUm|tunPKXmeV@Gkh+KYUBDds5`(*I@)Un@`cUO&w4&f& z!TUuOVU;ZJDnAQr@2=;fz_hQY{%ES24WFQ9r@%ufzJCfK?W|r3lu~9w{4r1<^*#>i z7$S+nfwIzu81!(}op`lYUgd)omGt3LNM&qq%k-}hfTstbYlF%o(^X+5hG6bvRo$)= zX23zBI;CRQ)e5SjQKPn1J$tHb+n52;zmP7Cw6fk6tE$%pfDU7jHKa#X-!Qh(&Nvmw zMS2hHXiuNND5^-qyJ$8$)YWl*PTu3Afv>)6q@S(+=_&uSBm$4lgi%GPl7KOK55gF) zTA)_68RES*!!}TrfvROHLQ|;a5@W;=r}t~HOWe{KN(frOZd9vAH8D?O zVjyajV8szw&lRmut%lllFiW;bss|cX-Ud8#K`5A?g9k5ypA!GF?0!H; zo!o%fnlgDdcmvWC+pwe`={jbTq5Y4eK~eY>%k+N?;QGt4!s_ov+MxcAEI*D*^O!DS)C8&3W8F^4U`3?xh4#>u3^>y8j zvS(9+=wKjmVh2bjd#4d(K~j!w0L&lHiK`X)g{^{2-`_D6R#!Gx#&>q@xZ&FL5fq9+ z0k>+WS=N$(x^Bq@HQlK#Kx@Y--Zd^nyNINNw_>^mbkVd1R=r?VG(6FQl?>kZet74C zJ8fe!kM=><91TIsPn4E0Fd8e@bfOSoRe_=ci^?^A99e2VR&2|C_Vf_I$Vdk`*B8&L zR;$aE+*j(DFW0P;VS3}cIP~4?e}?#<^mhLz4F7RxZ-saw))v!v0!UzFT^O|FJJv9( zHA!m6;nJ|MmxIhTo2`YWkVQ->!N>qbQBd_k!e)`b0r5oJpHi zbStFuOJ$I_kWow>=^t%VI%t?iLi&xHc#1;Yb=eNTy)103jP4Em0F_uFof20cCSS0Y zHMMGtsd}&~Tx2@(Zi`^kfRw61u2&eEOW~g^Z`H6M6`va05Pc_+VZe%OE z0jZ^^3gsX{Ln{2_7jM7JHn1&7kLeZ7c@;;Ri@}DvK>>za#;%YqYucg3-kn3b%>s%2 zI#+F5Qz{0fWHo5e*7!?h(5~Ql3TerHedMt3v&8v1;rzJhbX#ii?6iAM+cc!jiZ{#2 zc*D#G?bOZZX>x#e8}=&{`5~1Okdp0XxlfItA%*Gdce!hEw4yY@u*CjytGW6czrJiM zbC1uH+xYsE^O6Kl)ft+k@*L`Np^}O;IO?^J|1o(+fTmiw*oFPzU$s=NH7M%Tyh@QZ z4$ZPtjigsLn)B6wJ43NI(9%wCEi6*{nIFCC~h9SFC8B_^X2a`1-$b@^d{Ed`hIJ=_l)U zk!q1A=~|KYtM3jo^^lvxA-+CvM|tvJN^m>4Lt1w4!+OybuW5QpKC>_60nV?B8D#%I zXs9Z`7p5F$P->^MLV5`r6H1|Es$$^VOxy~o(_<>krM{wHYXkUx9__x46^fT3nun5U zs;5I0FDHgZ2o{zJ%aqTH9iG7akl==&BS23rq|56NY0gOX^zV<# zs?*b&wgJq`QxF&w&;7B9yW(n<@Zc+9W`rMnZ7*xm()BuT2b?(35B+AiItskVEw{kA z0_0Iy2j5cY66?~r%8`}4d^YU^#)+sCj`2`Fj<-2gB~R`wtM-=Vi4oF zMZ?BD?b%=(BNvBGfasjc*JR0~FM4yVkY-fqNN@_|7sN5>1jK=`!? 
zPGTArk`=qXXb}7u=w%%F>|5o~8zl%mR4M`}q1F`Z$>e7PDd>1}sc+QJ%b;%S9`LdH zgLG|FOM##j9OkYFuM$w}%R4|_?y|lOpl|qbrJrBkwtsAQt0u8Tu|c$?f`ucJ4M} zrpQI}K+wr`$V%D|HJHbdwZ)3Y^XbaOU59{DdIGc;{5WN(V+VkiREid;pCqc$rx&mn zvoyZg%C8=^>B@AQw7ed!(M2P$5KN`Eg_f5F$d?{u8`J;c_0`oM9-OUDuatsXZUE)R z6o*US9WK)Zc6TS}$5^b491Oe5tWv}_EvGRfbIok~`ZlQR5#CL(8Pz73>u7S>C9K*O zd@EO$E(|Sd1K7)vXyW+0NAzk8b%9_TkEoTi!_#}`7Us&RMf-zQjjSkmvay!OEfJxR z{L|=jx!`0g3i(7lS3m#y_j)xgrk4Q(Hs^Yh|I$U(s!;cocC?GH-c=1{i~hXw_}H>e z_d6M1O5g26g-~r!4?~SyS}%&TVd{8K84%b-2vlU{pFf}zJRhjsU8y7ez8qTL#f{J3 zHO(*k9wbXl-0~{0m+MU`NUi1&;%2`hhFOr-nTPD9X6Y}OokIR6@K7Zood>J}6MDQy z`gMo&K90UNgsD4CYVH^BLb~%YXRpgOwM(BGS|!0Wt*LOx?QXd-6%?i|iN7F-ed(M_ zZ{5iYYNmr{i=j|8|C+u1Ul|+wubxoSN-96ZRpt{ir zs{!mqm%(UXRsCzoP%~85E3p5dR#ZIs2`byC#*bX3I@B$zp=GqvKRCEXf?AeX3ZZ1u z_@Y^1@{Dc$^M7B|Eq3XjNdHMc%rw0c@E67z+nvt)hzV|};A1cr4owaA`Pow%Ceg<_ z(vP`9E26U>h}2pvb)vZ3@(tQskVhJGsXKBX%jsDe)6v;24y(8C}4p8MTm zZ%>MUr1ypCDz=_MFfJ|W(W>lycnI4e8Tk} zL)KO5E_IZ6^nKqCli1H@EA6xs;-sLfpsSRgeIr&#iwZg@3N1+E(8c~bkW7i&-UF-D zq-g*&&s92*J$t=6QEn-oj3Yn5Zu-1WNyhZ22bID&cslUriQsZ-<0d7W z!NbKwz@HvVyqEnho95J1f?8@Ppe8_~EQS26!3Usj8lehG<+&tE^>RsK7wB!c-qQNL zWNG(F(fwt-0_5(8pC&!@);uK~aUlJmpi$u!$0_)NhH)sVx}Ky(KE$Q&%ZtMf%Mi=)N6dHA_WO(k0~$CWi$7{^sJB zqPY`PKdA%7MWt!&qQ)zwhyFReP3hFTHy1*mUa8QZy{N`MuOmNk_zV|XDkAH?jVvF9 znPlCyK>a4WoQD0{_kECOotq~I1zZCiW|WH_Sy!KIWUC#!9vSF~&d2lGNifaG~P8Gm&ty*#weQ197T)ur9;CI>EHC z+nV2-NPS8BOz{vX-LqsHF@%mA_wTa(GxnftXLc0{&%^~gm) zYuXD3Lr>mG6GE&@sxT(1Z-UFNi|dlSoK-6tBSA86Pj7+a>6KvBKY9YPntpw+E)ImV z3e3+{WVuS$0`{^*RMQCM`w!$Q z6@<&LKoim;0*vu*T&&mXZi96$YF z&2#^nN9z~dcL{wQY_LOon}rmJJU#gHqWYjvjBAScwNu6E z=M`1e3GyFpx&LdrKMykf6iMqE3Ibr)%P!QS_8Uy$P4zg^E$*CYQY%Iw0OLRDcicuP zAx$roo{yAYxCNq;!ZV?|I|FKl0_A85~{5t#gPhg zgwMov<9HuL86Rn==Zz|%{D@HxsHyM=fXk`|iBqL~3tAF=d)t%Fc~Go;8vBPk8HRln zpZzPLDxX`GEl|e>n)q8~%aNfXvfW#XSh!b(ki4UsMOF2s{`J#}c5xk>##N&FNRyf< zlsNg`W0az(s}xwZ7^l1%uzIf?_!RxV@YAVdHOhTCNNA8NVKF@=B&ShnYOQ$EDLn#- znp#v>YX9}wtjPCO8dazos$_8jjCdIMzo?>J#gPgOsu4Y@vYbGA+AzCbrku4u!R`k^ 
zNfcAcBi3nz5(75G^R8mWu>D2=aF-4 z-BPBDKO%tS)dfQlQW~fO1bz>wj@7Jem`9Zx);8T)SBPg$i~0gjoo-HG!=tFj)lG3r zdJmwqUu@k%SC#Z3i2Gf|dpVk*Q9~a?^yi0D$hU%Y|5BiBi;4vYIz}y_5$Mtcdfy<_ zKClb&Y^uuns`o@z+=HsZ=v&hSwO_2&T3l+aZ=6n(MEwt@>x&YrlB>2wo=>T6*2|x3 zmV(ASTKx0J4yz%x`{6X*(z*m8peIq?j-%u;Ko))lU8#Osh3U|PrARrJ`zxcAh9Yb7 zQ4U2jSgBR560OUJ?$x4D@rvB~?3d5VKiLMxQFP75K7}eqy+R)Ks$WqR>VJI5uw>K# z7b*LLnyTg&07ALqZPi&)rHb=@N#p$=9-%Fem@+Z{FivgD%hYk?S@CHALSHR=jyDG z+N7KTrH_=TXd(G74MpkK&eKU%pvCNQzwS?^TdF@#roOk(ULGjTpHYk_kcIXL^J@nt zQLCIA;)@wxxO7&jQ+H^nxFw&EB`F}EkQJ&>G-02=!0Jq9D}Gj%6j*NxghE?p|F2Vz zeNt}6)Ys!P`dWJ!w?#n;rjSv$Fo6cS(a zO+P8A^#`3a*ND`BO0QCKqKXQ2fz(V6r7cKNMVlg%ev?sgT9z=7ej-kNdZaLy!nY~s zr@3)_HiJ6C`EmJWUS+)6Rgl;4@yGZi>gOy?mQ5NVD@>^TnQGWfD zN`0vV{T#`OrN{5_tM6L|Jx8l=(Sy#xH)&waJ{Kk;eGO*H^<M@!iP#^ zK&YSID9I}lg%UZgS+q|ZkX|YoX$(U+d+Sd+dEn1dnMeAKMIGaPl^N0{qr+!REb*J^;urG5B~NP z1q-6lgpVRx)mCCLY zRQ8&R&;qsEmJI!$$K*UM>CijSJ&P=gYo-TJp_I$w9B1p*;uk@5a#n^>v`S^W`_F+EK$8mbv{g~vJfg*+Wz0#QXD`t}t)>+mmoqj>h zh!(7o(-3k$k4BU!UwH@1?~xL}pCuBh)LS4e0#qX}QsvpCW1#l|YEe1EXz~yrNumXH zBSH#JyA*!$x`Zja5t2+u>Ol9C=-ZM6JP=CLKoTyh0Ff7^i!;Rui9zPBSrb%)bqhrF zSTE_rU9-{V)u)ICF0aKkRRn%8pUMU4W0+GbD*RRAY<^HZnL1hTU_)4}X;>F}gKdHu z#qjUH+DHAgko@ftBTb!CWJC6I+fF9(0{Z9!=t|wmy4_ZpmHUn0_ zyi?gVp#^JvidwyyE-KP=)1>JZUfo-KPQP0F?98c-#7sy^p_GDFI*(8-S4<%T8Wc~D z5t59j8{YeV8ARA(%{-z*qv=4UohqT6!?lMCsG{pM_Ni&Y`mC*RSVM&opsWx+@nj@yxuKKTI48#xO3XdJ+Q&#w7;O&Qe=>JyKzY=f0oN+WrCMQIg6- zfkHsr$*9$mT3aN2(p+Y%i36ZF0b2hRAm36X{hn`ux2lonsGK6*z9xYNTRM=WV^K&e zK0jUmQZ_B>^Ae==9bZD#n_)|3uaxD(2ZB!oSj|irD4{C6)518M#etJDAT`1 zfy&Q77x_H=Y)@UOBh=iHbUIzFC0}X<)l6OvO;97N?Ni3zAKFj0RZ=afdx7C$6c1Ap z(s>G1bkO-h_p|}(MU7cru)ButEmd?`+WA0vSQyex>PgM6MqXdDSBkGO}C^3c(vy@_0 z!(!(}IclDmu4AnW`LD=m;%HUHd{a4)P#sy0GXYxMN^Z%e;pBrl`Z30juifqh47`1| z_zSwlAA4F;QN+_ilD4BOn!OOGTQ;h-xslQW^|37;L*FQNqwJxhZ*Pc53{9*2-=&9o?*ZfYe*|K(>8ojZqP-$J$bMohjc(Hl$}MN!xRXy0o8l zI%NoD?f3P`Jw3HN|07aurl9)&e5y_uk;QWa8Ikg0l`qOF!e-UvHo$vR=s$4s!NVE? 
z$q{K;w9|Z1`X`?RG%0E@_wH3feoHA~dUWC=RccDBu%yoP(t4J5o*pGpUaB>(cc%RQ$t?PA!8S{rIO-pMQB- zEwNHjsrIH@!Ce$y$SG6VrgaTBGeo42~TX-vPn6L zaqht3(!?v za!cdmkuo*96atJ-aD7Gb@k4R-%gNdIqq8F4J}dTFKVy>`swoMu z>!6=p-2dS&1jw?aHL#mRig+ZRDjVldF)%0K?Ng{`1aZHt`clQ2kotl@^EdlHHhum+ z7@Ib?-G!Ph6I2^i$UA|T9{cq9`+#4eoDr7`YqbM(d-D%^DTq}H;Y^M-WG4#Q1l6Te z7$()JQ9iW?hid43+_ZAjtt48Lu79}BdfrjtHWVpL5M%GR3iN|TRQ;ry)*pU#cbpaJF_3cR4H&vW30;p-+Tcyf&`>{Yqxxu^LMU{H`{;g1ZUi1AM z3NP6pHGO;ot318JYTbg>P5SwS_3IF;(e61sU9av2Yd1IT1J&4A+b|@6#ET~{3YUQT zuF%d6MP&_cjxyMO_hDreQF{~75h^3qNE`3+EMAc)4%@roLf zOnCo$V_#$65>yc7e2+cJK>Om-lB1VG9W%WIeo-WCc(g*Bc~;5lE+@L>fEl_JkEdN*IJ?WPXj|Gm0?>fbf7lShQO!P`@q? zvc&ycwo9G#h!^_8edkh&rg0UR>1c(aO6fvzS@Ej1hC_uyEnoxa7YDkE1YlU30M^}^8p)Lh3;IcvRoa-Z zZrAo9K60kABc)4wHRS@j@cSUJORVCuh)M(2#R#u@Ysl zVyPL_vat22>LFcXG=!m~snE0F&sB;`ttjg&GEdgkR`ppvcN;!`>apiaaM=e8(+8if zRws%@mBpe4^7ySIJ#4b}Bka>=kH}gO1Sn6VD%WT(|-l)1EV0Bx`;AhuPtK1 zg1S^f?Xrgcsdeo)iMB32q)eODNfE0^sFRsuQC0B-7#ggu<7^TB2oD#oHM#-d1nc5z zRTrn1t5r;cfOV`P#b_>2F|DTvpwBgc?5TCuk=}Bhij)KfL@S*xzQ1N#ZzEJ9FI13z zKu610n(QZ{mD!g>0w|aOMl+TTzBSU7@c|%ZYImt&oE`wr$)EW*>91bV9+GKUU;PPB z8oSH_q{P}mmAH3>MIrbc6bIyHO0m0NcsgLMFr1Ja?N)EJvcOkt4PP` z>iE}Gip)Y@C4a&cv0!$Y^A@BmI5s*=^{Qi_OYd^(0yX&kNYzqXaTr=lPipUN5d%tl zakgC3EKYw6v;F|+_(&xfM8I^XrH}<{%I&j&l;>5T1ey~_0s^xsPNLCa#7hQ*YKW?Hh` zdll#R#hYKZNKJ~cceKuyt8!&UW4_WrM!QhfkX-0HeSJiKZUgDp9a3?(hOhl_vqM#x zawSbzbb>+_E3xn+>pkD4iWz7moSq!Xs_Xf+8&UPk18eOgY!K4?6E@J?iY@D^g}^s@ z?^25vu=QGXd`O}U&Y8Ui6O4;}qU&$j9i=HuX>!fC3Ce^L4qX z%Vny9o95M87FA7cNi}yU!wVXK@%kLq%iFdcQNvty=Ta7x{^N^BfA#&bO?4~ObC#q= zwH2uy^izGhR$(*Xw}Ez~>@P$U()6hfJ@oo-gnX>xc5;YP4*t4%MIG3Y&elH9uVJxLbC zW2=wz^&j+raIzc`?V$oF#0vJ}PTOU#58nd&GCa}I4}VdlbjtrFvMTh}e8;zKF}Uvh z=%H9&;L8(nD*N_zZLs_DWpK)dJHRTv#1sjvkQ&!2tp^z}aBM{} z3IM2TDXJNrla;S8;x3Po|JQ=`Yr*=ae)YFuc#hB5v|?+ox?evI)c_Ya;Mn*5sgQ8)TOb>RRayE^FvSW7 z=KIl2mI@7^FaQ2)e_3K!%+Kx9;cd`5#R3f*A0|I9sQ0f!RO-|d z%RE&rbpp~)g*{aXb$n&*NvwY_a+%4&3f+J{6t!zDSsdGWl@qRpOY3s`o8Mm)UWE?R 
zfe;8PVwVKfF2yS^Cn2jk)Zl~n(GinzankAMKZZ3Wq-zxeCs!3Y$}X_ITI5^G=DphZ z5cKyCK%b%kjiQF_?WufwE?hezh9{)xq_EFlaKqiyxT*q$Pko3J%_?qZ2B*27DBchY0(Yve(VQVfs*-C z7KR#3{P_hS7fOJm%8+w#D|fq4vj>c6g8FywDzS*|!uku13$9@(wWW68SZ7Vo7Wl$- zlrtrWX93f2NI&_PoIU=T4ghOHs8|Z4ix@jyS<9QMRhHapN4g_qkX?XJKdD3pPy)lY zU5ZyYaitJhC)NV0?p2sjT**@PD~|Qg%rvLbFjx~h+^XDBov_ZyfFKn^-!)-%PB@n? zE7{X^{~Oo+FyH?!o+AUy3=0p*4e-GW!39~UYoTZT~chS45k~PfciB;{r)7gy6ku(aaaBD^e*3r{=GqTPs@!2 z8=@NtW`O9P=u0-BDJv{p?yw%ODFHnqEREOdVli;L`QLX%{XCS(Qmu0ZQbR zf@2%+D7G=&^D9cDYH2sNDzya^aX}mj?yj)%`yT`MU@re`Q6F+FT5{@y79#=3smoeL zg2xk;QOBZ=_4AUpf8VE*^v<3t%KWcGFyRNa3h-=I$`&0sfqb)C&a(xn4Xm3g&;#q5 zQ6v^xymw_bQBA9G1;9re!VoE5%w)0TEY39Lh3&eUca42at_7%`v z+J8(Jq>HnbV!(uH zq#6;<%NXaRT0p}%pwp%cI$$BWO^lE(?SOMwbGnKxVSZ>`g?UiqDguom8TeM^GDd>* zcJl1vF-#B_!w?QP>gj=iMIIE@3YYHG1jQ{-{bliera!cxoU~v%ISlri)uG>S;XU~0 zAz1sa-bavWdo{dLn;2n9&Otse5I?^t3i%o{suVoqpYMzne@;iK7q@GS7E!+N@hQfq z{zZhh&_BIBr6>07s(z2_7-XUK1R2fr?yNjXWFh#gWymAc4Qn+Cd(OnZtoxxyj&h1j zPYJje_R@gXJiqmCh6{c4?40y+SDv);QX&CKfiwgvI3caj zCe{fyNEKC-6K%3kq+PPmaO=(8;GFgCff7{oaRU|22gl>2E{rldBi7FYtSPATBCESa zWJrsOx?j?~cuw?nrKJZCtsdg8eaMqYo6D;+->-Rj_Xbl1O{_m)Ph)7>8#1+U{qjZ3 z1Wz@$Jnv?d(+AYgufx@Sr$*_MjMm0?WXUPnPnSn}I~A%YA%rF|jr)qZ?>)^V^sADY zE1El6(KA$E)Foh!MHBs%$`NjW7f5`t5`|DiC=IP5vk#S?_pS+Q-1vdm)aT#1^kX}{ zZ?V%z8y0Mjf}H5eE`%iQ_bSfMmM&C0+lkKR?#S+RuN_)E#9 zi8;PER5g(>d@WhjuBb(sj>6_T(z{yJ*yRs0zqG+d5gt(~BK|~2qN)>y{$BvBN9;GB*|B&2GMCk=SMRb7$biA<-g zBZaDVx|_P5>6BH-v{p3vy@E*F2Q+v^xlxN)P~}*#j0s$x-#844@2sCE@8y{dQL9Bc_l77O(h)cRG` zqD}o>fs%$^gcU_9;ESlrDQcM4fgXH*J4?QtA4?t0Ng?*rY3S({P>c5;{)8RbzEzMO z?NoHqwzO$Xv+xtRUEX8vgYVT0btI*wKaMX|9qR{LE0@fU7)CXve$VoKy=y}omsS&X zD&Tj#dgA4(AOVe%2E>>8)c@&8>M?`XC`NQpDB)|XimP>GsyD3c1(S00kL8An@kbn= zk)3S1JW9zmU?o?pCI9zVk=}*s=l@Lc%1cZ!5NIchNJ%_o(uyX|uYod6PhQ%gK+5kc z6f4L}yACwP*0+>UzyPY5M(vw&Z0HpB)b1@0R^Z_(X(@R_Az%&Y(~~hzPd9l@v1&@G zUpi~tbOmgOtEx29wtFm3i_!(|00h%4v@Rup{phUmIm&(=VU@RM>HbiiKSkN;+fM>u zdnNv<7cqv`VNX_>#L69pDfRBx7Zj|hrmtTgQaOL3#z8u8{XzH+C2bNLKrg4?k7hLu 
zSjDRr{UFRppEDuUJRp2q4ge!<$SA@4wHQuK_vbQ5uSr!IT%;OO7FsV5U*;-?e~WKX zKR}y9k2j>`3O&9euL@o9{cWzWwE*?$Hi>Q;`3OJ${^QjTkxE0ObVVhwP*l#PX!Lig zFuyM$<@(F8M+3#9HCD~0S_%{NqL`;Ew10&jt*WX>Q?4SS{(VD5Uk~EvXG%>IpBSP; z3Sf#nrI4rc9JbmLU2&e7Ug{jB}qt6AaOppTA81^>p-WP}3vzJF=4X zPc4e&LPfAy^*0^;l89btlzBJ1KE2;2rjkoMCkQ(A!ynUY)D z4R8Klt9q$spDqYbwuMfoyCZD}QSj0OpZ5@IocCw}JMyU7Woo)+{XgopfKOwK8zC@7 z9caT*Yq|w$ApO6X?d$rnH59qx?)kZd5XcGwUFmd%bu1KAg;Z(24rC_$9m4#`+_e5U zba_NnPvbdAP`yUw3MOk(FtE$2;!j-@Qt#3~B&i=q%4eTV07g;8`8sIpP3{8q9O&4d z%)kAv|G4~9|53Il^Noao>9oY1qe9Pod5#};x{=VX@V;fW@OPUx(zIY>fC5jA*gwFL zq0*F3nP~@3H!t|8LXDJqe;vMX74eq9W1M+t!yotOu8Jnxa#Bq9rPNu&N71jK`q@EA z2SlY|59&9ET38X~!81X8jx71E-`Bk*`{3?rz z*?C{y>3aU1LR@wIecLvrnVi1WZoLQHzO=3~?H9qXB0lGx`+d5*v}{)@`bil>VD9O0 zt@brq79HBYY-y6_-j5}FB5y(OWsj07!Ql`iJaL^5A_?N&A{q$qo5_f-i46Q5-MMm(d~Q38aRN(1J9c*3TTZ-aYa@ zoBkK{2G%J-^D z)t_c|WpNTn)1(L$#cJ-QqEZ~XSk#HENhvPXd*EX-slMh|$>kEIi9L`1&gcE8P^Tpg zX%BS0D~JlJkk1sxO~TZb8w(=3ex8u?`o2kJc2lao9Wb6N6s5_c&aiw+39e3~Ns=%- z6d$hVeu5H;e&yLvU~(uz*Ed171-C{CwXXW+YqhqdAD28QIm`t~S8M$>$x>$lU4hvy z$P*S8MM{+UzZ(@wnie+3^7z*r)oi0PL4b5(dXx=6Q$_A?y39hXF6ngsb^3=&6>jUw zI=!e_Kv`l6T{xLeRv8Qmepeaj=aKQ>@q&Io@>VI^=1+()P*FH2_^C=Ui(7CC>QYwL zb2TizC!yy~U;kr#Ees7y!1sxN{-Q!H{b>Bx02N>TKT~M+ANF&jo8D$vHMY@Eny{Ff z6RTBS2Ii=sl8dP)R(1QKO8+Ge3DY|fphd1v%c{v(d3RZ}eyX}u_bIL_AJ^YL9y_SW zR?i=JzM`!Tdv|_Q6<9S@|GFlu@X)(WxcZiKU$qdXkqat{t6j(DdX?n5go4yl(xO&B zKN1(8Qd0~@x#?iJ;pdlCegSeJjSSU_*Gd*u6y>wlQv=%czh(o{mJk=pg8K_tA@>X6 zjI?8w;`4naRvj_{p!)r5F(6)tJFbFlBY-%N`!$H(=_YmyP&t+Ro-bYcrw(xrsn;*+ zL<{ya#&j{BQcz3>AJYepUV)+#kYKX=U#Kjd)38r>gQ?{&j-=G&j@JX3QgwpdiD9J1 zw`q~9lCTHllaAMPV${5*cR$Cfaq~=_=BbWJRSE-APz|1PiPiC3)p{nSI79tfEotV| zX#ML||29Jm?=SrTA@QN7ww2dYy0z_0F%5T$t>Mlb03xSG*|!*(QnGvj!epbg|M90_ zRmb=okGemP9REp&M3VKl?p@&Zpi-F@ePJl3R5ri|IerXr{wLd%GL}8nt zK|G(VW^o6Ha+L~c6Lgx#fQ91gc&t#ktRu~j|ND9V-@(MfB%uZmJ!5fYaj5z(uEPK6 z%EzBbm8zToIj*d&zrWobn}--hz{@JRwu^#--cqf|I#lX%yrN`bNF{(%8$h^rn_^@E z9@ILy!s^Rc03wQ&MI|i?dM~(Yr}lNZ{rk;_;P|4F+o?)ssW_dI 
z7T)VsqXf~IsyH=@P*q@(xJ#V$zK7coS9O(`uP=AnmuS`^=^#$-`!VcW&@Q$#^ipLv z?Hbqyk;l_2WK+98JyTG7SA4c#=^|Kl9k--7URB)wgemlUFV?AGb^QLLGo`a7rCyCX zMEv{XKCPex;p#F4)RDuCKgg;V2bFA6AM#M0|nhN|Ix#L(vSF` z|6Kp``k(Nh5Ow-B{dm#oPcMV`&nl{R)j{;!4wT^32pU%`I@k%Sb5lRi2tjsl$IJTp zG3ve4<-JD1pS~rwd-vx%CeYb9aP+58S>;cs+taQ0 z?EJ1i0&zVB;QL2aQ?fA)MF1UBDOaW|O2Fx_Je0`l=c*z``$$Md2+olfaddE5dAJS~`}0%VRMN8aRJiQE^fS+k=< z6#dwrR5Hb+>0iSXKa}Niu4oA7o_`JJ#yiS(?u{nt*23eh(;nR2mn~^1mQ-fd;VMqN zQFL{_+n}$p$VZwOiqo;uyZtmsJ&Y>Zs^onnz-+%ly1--Hvx@is zf}(%kubN1T3O292>{E&pP|&a3<+EX3F@@$|4#WP_L4PLGfmRDM;(Q0}VfCY|XA4$5 z`R27R$~^IB?^Py#L4BBlRX7mvS73#eCK7xT*4xsccx!X}5)YYtFY8(i)6)fxg zmb!!Uo091(A4uF3RytJn6}xh&g&{G6s@NDM-qlfoj{VpIn(nOs!KZ%}8RYX7VavCO7bR5^|DgE#vj+Dv zy;sAOy!1aVsg69wv{pUF1-U$be0>}O{h&dhXUa|NIa%MGng6lyhl?NPLm2ntZQo*;7IQU3QK+pa z-BaIw&sWgeZON3Y4 zFhh6hVwTHFb|a&5M*6*tG-1^lk{Z-C{Up+Lnl|kun=J7^df6sa91PgqdlE9FMXW;H z&rV!wOZo-(bOU{Y|4D}l*S>J>>LR04ZrlN`QH9#VgGi@)z!gZ#T9D0x`E{UKFWcb; z5Fw|jVTuFPw6dq7s=TJC0FeuY5f;7m;#3v&6=}0&UY4jig8Gr}fTt%ncqobzDHX&& zH``JU&+0D4*40wr+CQBGLdIi}BuvJ(k0CS7% z{ih|xtC~@MktZdPeSH-u@%~@KEroyAYj+*&@YWr$KPN-VDOT0vnd&wuHB@17S@8+d zADmF7i74vwF-$o$Y}VtdgU@V-wi3{2%{_m-gPeDwQ>h+n}mK znr@?4eacIF9IHw1DrhQQI$E+=Q+=^^wWe89_Yh_}R_0$ytgkOqdhLE%2immEVQX64 zRmNs#zbepNvhsNK^9W@E>Pj_@JE0P5TkMlPnWcHs{wdbf-G6-Osn&4j3TupKjMQaQ z&#LgtSyohb#frK~>EjqB`6Yf)Rm*u!eOZWGvMD_oGlp|?#l z7PXblwDqv7tM%x*-rMIct6FfuVNgp&;VMp~11TRq`1+?E>UZ2z8Pwu6&}xEO0s2LX zI!7=cx@5P!Uic0u3Yb=pwp4YnUzdEUCbgB=$60q}*sHauN3`qrRyMo6c1pFOD7W`FK?v%?J(BipreyU#8aE_^q-Co%7e{fn7;@+3GK)UpWrRLc%!;YWj!g$B6iRp zH`9TjvO80QhONtAK4eSCuay2g9qRY@rJvYV&3_dI5wO||v8(6#z{kT_1#?`UHuD!79q>7 zC@A96O{%+f%oiR0jib}jUTS-O2j5HYGpI$h?dh=DOT*$`Vpus_{d`q7O+pw}b+@cy zl;1ht2bbF5?ZYM^h4TvAw#ME=HJ9ps#Tn}d*H1D*TTOWPR~#*tRkdZl6~?B=%$9({O7%>Ky1#mZD_@CEk*{F-&|@kEZkYi43Zq;zc0eGr)zs< zPoJwhzc}{&?fo6AR1?yuoQ)lA*m1|cMn-Oq52)^M*n})=S{PcvGvqa^+8u$M8t`$? 
zsQ%Gz@s{d3gX%b2C``nb3FgNpO#FK3d;7=wKg09?#OFB=vc&g;a5?F8&FXtSw<;}n>{%AxmJlb~sRdgspQ&`_YCsVUC{Nn8{gy(|b`it1u$s6` z9LLFZ;uNtOXiY09@%4irJJ)|v6?jV|oTDwenncD0x3P*9_y5!UQu+R`)L%E`7ni+S zu$uB-#Y{=QzH<<(!W*Qe_y%8}tB!@g*k!boP9 z`^F1K(XBA+>r(7yS;q-4ZL>hED*(>Z%lkT?v!59!tCE2{DJEj#^p zpZ)H`Ym1}gbULT*c6D8F*I!pX8*=siuyeLnLQJyw@^A~p{J5t($`x)VBg{$PK(CQ* zu4Q_hf_#rAMS)MZ!8}-F{a1Z(!|%w-tOO#YXsU{d0vBEaAHpyuF% ze%>upmM6?5UXoaJK00Ptd3;r>vRADwfggerkI2Hg0J;6>;#V8cDttEyx+Rcd*8+Q zespGpZqoH+Z=PXsMCPFSOUrZO#x_WpW>G`;`asLMxTPnB7nuTIiMgqce{e#ob{6wK zhPbeRJj!@|9bSgp)PJT{>ES|_`IuWu-7u8{cxo@clKPh|ir5`TX|FsJH$} z+(J|;xF}pOrgJx?Onkc<@nkli+trSf>)6$#51BH(bVZY32fqQ2j>Ysj`TH_IcC)k8 z(`&@c%Tq|tUEo8nJ9&ZF$*$Kbf;__JCqtXeBW((_hEYdYqw`7@f;ak`c^0DSWQZbu z+0bMwu`WtdV-6!O^;jz(lDH(Un1X#EbCGwUYx?ei#K#Tfg!a{oF{fT;KO- z>f_}8n+^s8ICVA-!-!SY$}Jm|CWtWiWbQ$3ZO@%cn^w`zgkg<f6TD?G4~fK{r~TV@HUt? zlV3PN;j1?k#D=_l4GR0KRid~xg$lMOuWD1g9AR0ckP9WtdH9@vKOz1kkm29-?7BSf z=Ugs3&cARTRggOX*uxR{ZVK}{eIBlpr614sD$*}EtL^mTDyQEa?WVA|o6udRDMV7D z_{o=yF$H+(i~UX?O^9^AjT?wryR>{sSzKY#52gY%i11e+sMn&V~I^uj$orO@jr=hUtU zU8O6AU!-$=3WrGveZ7$PC_gq2P9|X;O3MCz>3;n_L@74{>}`!Hc6>ae5<_rmYz^&H zvGczCn%HeGqgm@2D)IE;$4Fc8yy;%Njm555-UTX2B6hoe!V^rQjNShI_uuy{(;06m zT-Y#~-;{4#)tA=R@+)cSWl>X@wkS*ID4V3Ew|fN((2t9Hd3hNd?`XgenMeG(--egl SRm?V3JD! literal 113194 zcmV(vK{E?NUCng zig)#1<5q;ja=RMt*iHNYH;j=0$prWTaMpQq?B~55lj%?3`Z-y&jTI&WD74@~}$hBtrD-G`&yZz`Wz9clRJ zY~(K4C|^czEHk#=muO?mYNOqMJF}2${7INC1O%tH`ToWE-5?l~AI8zzOBSN@ss1g! 
z3-O*#4#v5m*L=srVtU^37M{d3MVQf4cYLw)4}dtdX0)SGtxzgIp}L6Fc~3HQgAlzp za6j})uAW42kLQ09UDp&*?ie(FoQFFM(pu+ib}~hA&-p9EUml%zdj6N}3`40`l+q&< zqHRhL{XaeP%UJk+TyC%#WnZ=K&YRPCSEtNi6<>9`9$#VMmqk-V`LmC6z4XSXewr@d zAJ+bxPLcIry#uiR|CIrCeA)fRb-F3zJiX}rM*?&Pw7fn4?N7U3a}U#-HLP?1DLcOE zz2BRzDMIw_kOPVq^fv{tDdncO!I{gqTs-Z5TY6I(e~4nJ5P`BoBwNvl)_ai>%%5Gq znC*bawr7j@2Aj0=M%63`l>O|x>8DEHZFaGR{VB~TllyHK!0NvJzt_xuF@PwVDSDeR z`S%a=R~x=xb>BQ}!DZ~vhd(2|!ueg1num}UKL1@vvF)xc>+j)}+`Ye#0|KhiYODe=htzf7}9 zDZ+36`CV|14Cbh#bw`i*&!1`k7U=>`wR4n3_wUXhB8{*g!C1#~I)9>9oIZRM{vs*B z{Q6yS@^~S9AHJ6C;fui3bgX_H;N6f`6>VtXa_y3yoxU-sr^uew1P=#gbyiS%L zX{Yx~hn6IeSo_8X6WVSz=NJzE$Eba((7IM;K(#4HIbCc0iMPV%Gl+5pgPE+n57QhR zQwgPapX-wMnLo!#^2ukm@~SHycrs`2&FS5f&*NRs;xXNGoPwg;GR)uIJJHgyAL!2U zKU|J(y*Z`#^TMg2_e*b;V;fX+*W}&4Xl+x3c~~BC_i!IMe>?V9T~jcq89E#zf%}{L zCd2AD(%)4$^M@$E0?MDw?LE$JN{`Re;6|ZG>9UoV$w~52_Vy&Hf9oDMy!ZIVhvZJS z>(U#3k|dU1H8aYuJzJs}^}AX6;5nx?e0&{-{Tn_yQDJt0+75}glY{KBSGjXC{4n)! zPNq3q^Z=AhIxGh&@uj1}Kf7->>G@%bp@>hrTc6hUD2?$;-t_qvMNPJZ@)u{`p)er) zx7TshYOaR1?MpW)JKVFpyDwMpMC)%QF=&5tb{?#4c}i<|>fN_78Y$t-&eAZ4bU5>I zBaC@&Q}m%e8Zq#p5=Y-hcZif$pteizc~o4^G2$P1i`xBTKPf$FWykSv9yS%jJHJCs zH(2aUALk3yLymyY*9U*e=yvE8i_~4u98It3j`;?TGmKwp95imxh99#) z$kYTvM*3h~sX;j(tE)ai>YdGOPeWl)1=42ZklZw`_ckC7<0SJFT)??(PpSv{{b_kh z;aKHJ`gB4Fq&@{}vgjgt8D0`~dZ50vJ!#ny1WDoDp+}lzya>+Nx*OWAMUX%H6<(1b z_c`YN{+#V851$}V=op;#>6#T=tkINEzTu(F8rdlUci z54}ZP+@ZX`0$=^LjU?g#sEvX81)zJb{yF@E%k?A5&Uim;@-T|e~FdgrCNK{-%7?%HcWX-|fYIjnmCPf?dwp1%S3^WGu#@!quc zX^!24>w}ax1aUQRT~<7D>#?6dm&@DX4ambZs3%yDn8hT5cdsb_8{wnCGSJ_hSD?7^ zH~KoZngyVmzi~A8 zjU6vu`NR>IdD%!`O=$iZmo^~$#{Vqw2XCJg8a;3_C=B)yrt@dSB?maUyakml%P4o# zZ4(2_;2sBa*@e>!m8QlPsjLs54uSTP>X;XRj_lwr&IX0`!Ca0q!V#t_n>7wQkK^Qb zx6=f?46{%Ynkd%1dUwxt0qfHrAFVsXJ)%yxC&Kp{n4gEAWy4;DDLNa9mdMO~9Nd|% zV?b>dUE*!B1arrpbLYb_TXSR2^EylHYVz<j~w!$o+EEEBQiSu%0xl|p?l|HgS`M38TighBOjJiNoJ~77?RDtviYFK)G zHLHjlw4--(gN6jhO)4|DcHNA#3Z0QrkrNBMK9QEc6zy-$9;bK-#-huI6jw3lIy%NKhFv*12sjmw|+Ru8h% 
zaP$ZER6dbFAY27>O~u)Rk2wWe4nw5-cjx@l`(28M-Y&@`glUXzR2w#^q10lF^DD)P zuixnF={z_+-dmrLnZJ{1$2b266p8qZXz+QFKfizSkVK|k9=kmWlvYZ5{*gZYb?E}JXsCQGE8m&!hIZo$ugox9JO>f@gHf5M! zP5<$%<@?R2s^+Oz>Z$Gfr&jgqW=yMlt+mPDrfXsngUcQd=o)S{aZ>CW#TzS1OEoyR z!H9RC^v&;p&rdVvPt%oMqG;}qeO=12>)zuuY6+hFx(h=%zgL`JdpK|U=8RFdY1C#? z>h`AS!N)MSzh99R2Nl`4@YFD6JXW8nJH>!XcHjHx?+vx&vOPoIJKn>YANMo$%%D}> ztO}3lRv{=s<%^d(Wm4)?)HqXv^h1xK7l}RwKfj0H;ve{Zx@i!u4DXRu`t8~Kvm{wK z${U##^#@ZWsQ@#%?h}7cuJz`>y?1AHYw{=M6MUOSv|*^ z8ML-RQV+p>c1rTXF$ap**ias+0J`~D_7zi7xmzmdAt8M7calF$+Th6u(;iDv&O`OP zJOohqOtLzj0{UbF?>^JY7wQzM*4H1axWjd53cz1sAm!hdy#KI_BOZ;D`5ZJUUU z@gj(aR?CBV#h(RQG?dGxVke$HU`-L}|$@bQ~JJ8w>eVUp}rWs6_V;xWuF5YTo0@V9WLYF++5 zc@LCx*m(}~bk@qmSR|h$DeuxaHR)r@!5T9t%Mv4x)6OqgmI_UM6;5~{ON-7ternPb zXIH)@<7x6Cl5vC63?vIXjSJCS0U$1e=C5;3k59w+HjUJ^_qlx82Px}~iMfmp(|~-@ zd3cOw8z-u8=Kg(N@ZeBe+^Y|z?KFFgpi?RU!i(;C|mn8WVT5 z5r8Dm`1DClsF&I+L*M$O@+)zQk37x-bXV$eAaaMLn=3K(=yPn?}cb3yui=d+xrZY z(q|>_ezWBj(Q+U?`zya+ou$*ocL59gG%BL9Zz=#clX;P5t}ca$di{ire0A<^eXq0L zV>y8t=k*2%KgXa?IeG}MvFp+BJU$>(!aQj{Tv9=}gW2KEUk`&DRLfwR4#|9H|5yKi zbb_#IMehxJPX9LyZq0B{`xVm-yLi^!^4P|);&ss*)h|6dANVDJyQ`gdtn4Y_qj>6v z+w?2jtGLd7zM9)zPZ~8)f8lrj5PM5LJ0A#7AJ2YUh&O*5JbxJN_9Lr!8~BZpf{rhE zh^$%^0eqCh&zS#DCiegf8-m?;lus9|yB@OL58j-{o>$E3ZHycM^;{3qr~SwU62KPs zY}Wy>3tK`)&rm?S_S@0?+ZY=T?(|0=0!4E(z8_q+CtG5VHfQk&mi36CNcDzp+e3iZ z2lU5&_o_+bNTE6GPW_gf`z(XHKXR0Agp5gGy%wxmsRY0)@J3t$MVLR$ed?ySQM*vq z8|cNpfO9t}XtSUIl}L&7-)~#p^616-E!)k}i)lmjQUv;Q{5U>_9D6i6U;JG!n;@mh zO&`5lk5*&EG)KC`grE;|bx61EL}OAg@0Xz9;x7IE$E{lNH!{m>zP$>y=_1ZAe(Lq?n*3K04!!@pUT?7u*_HNp znO}8{44XYUCO|Q@AWRidZoTz1*^vfeC#9w#ulK|}9Zy`)ZbX!e!1#3nmoEL`?@`(7b#3nF%?w!H}LmB0KmQD)**B{0)0c8BOB+*ISEB-#}bn2WiO;?7Q zpLzBnf(HDdySnP3prY{DX#DM|I(3Ls&U_~qa3-J76E}d0F5jJ1yey8MPmoCa)L*nm zKryG^n#X`*z%!03y$;>@yva_>qd!}oFuMm6oMCUZ3vY}iOzwE2?z4!L42pQk5vQNi z{N2oF9c~Gl{pp8S`d1O>Z}X4W`^$a(%s;=pJo}$tUhYpXFIhgCb`$RKnPGUgrx+Bh z2R%>j9PfOyx(N?F1&lw$A+iq8;5+pdDhllEJ2ggKZY$d17+upP4it=|)KXf5lzJcT zZnQGca!59-1cdI{y?L2V#9@bRbfrhG#bs>jyo(e(G* 
z^(Qk*l)-G}f3>LhyCbC9P=HsDbmN%j^AEm`2Pr30Zp>xz!H3CCy$o{>Wu!g>9fpfl z>8fjq`pW#F3e-XC@Rj7`-xS=pC`y-L9zR1yYT-wCS$KrkVa&Tz{)=EU1hQo@NssOm zshTp_B0;5_HeDT6BQAy#CeeZ`kZ;o@B$;WBN+6dPDSX~JJU#i4$4u7dB!T6+>3oHP zbXzE#DaEXr;0zOObvREymygpf4f1-H_(&?Ka^J#R)Ct0`NWEuijBWdp{~w&CaP0eV zePTR=G3iUr_>P_wYSJv?JTT}8@MchE^ti5*9OVSV%%1#tM|SH&)EP!(Nl(v?`z9px z>pnvFd(a!9j(=DR@S(zPu;@n~kxWP?hInM=m}-4L*?xiU7xDD_TrR8^jwk(bUP zp5zmPpyzsn6D`98igU~ymLcc_x3xqpes_r z&%Do;-ojwpIn2=SJS>w}2d_|znV|vknzHa_X>HK6xkUuyt;N}Ta4De~b z-lxevejlXd0ZB4_MWYxcq+j1js=PLk)0fzAI8O4q-ho5;PN2)Iw>eDy;!OPfrrDtU z&QVfJB*d%wA<#Ofyfe5w{7bKXFeTFNZZ+U*!AdCCBX2^BnWhxhnn_?R@bw{r(3!(cc+>nPP&+e(O#+Im5U^Mq>CG*I*{K9P><*uq~UFGh=*NQ zVJ#QM!)@=6d@Ba8=$BFueqIaJ6xJ6SdOW0g}5RZO%kZE^noXdNhW z7?Wj+l-!ei3@LqLP-US_ln*mjk@###z?VP|vFRei)5l)JB2IsYYV)o!C^yiko!?vI z(4)#jl~D==@;=iFo?R+EdDqkATc|pdvd?xjYj2BIGxl+Hf_0B7p4rhvBlHQ1K))L|5;Z3c zW(0E%EVbWhfUKYcj;G$}@DSEpZqY8imGJgmgEJQIkS$pi6J`rm!Dl5L{@7lomYd7> z-aYzLk7g%*1l0eJhD)ggbkB>I8G4_2~Q>+6y7;Yb(%Su$jd&?o`$Xa zDxByN8kdsY&II%IHto6WgkEyI)OLw)O5!_1qfTX*yH$JInt~3#>8A~+WW^gpDC4;n zw)A)%+n*e!3=Izt!eq9DiSy*M__(m)wq6&VU-1r^k_G+4IYfG}nJH@i`->|%*n<$Mf#5^H0BHcoAI(_<;-j?q+fcU&GkKNZj*g-!J9D9T!OiH`E3IxTCbHx zpweC1o$HtjCL4To=P)tqQl!hMun(yMKhZJL?-?m)rMa8r z@AvWE_TT2g`|@ir}px7v`6G=_i%!-<_(pxDxeOr{6}g|F&(F!70-!bs70fj5iR#N zwWwOBuzFDY`TvZHwWk#Na09qtWapGqs?)h-Qu#=26b;rSB+iV6DxT{@h*yKR8EK`@ zKUe+fp0?Av3;5%r!An?I1+|%2R$K+NM%yV@3s_;YniJCb(B@XL0h%{A<6Q8D`U7Hs zw_TuRy5k)!q8zGR;`^ZPRDkOD2|-y-5!(gkp)(b-It+^X7_Zku;`Dj+1ZgTOYOJ~6 z>4YM=fs-;mpputh##j`Oo4E*{S~A##0gS+;?K^|>jQ=G9a8&c!8N<+vDQ?x zFXQ}bmg{}QJ+xwG;2$8*qQNz%JoXvZdI(07~5()JpGyEi`CuhDeq(_ zw@JvJ)XM6cxQcYE=Sbvi)rmZ_iEZ`sy6?3TJ<{B^5ysm^8m>(ts*oDvjL+H~=0D?e z*D%jhj^mDOQH3zxj@HT=-I5T^j?04v*5(0l`T1m{?}qr!%N{Vmcq+0 z{dxK@DfA2O8RD@x)e`18`YG_iuUe#ISX zKVBh(0I1^*@o084^*dv~Vc^>bdZ5$-)wJagONJ*Qn56D#lKRx3-+RyVYrL#c(>0rl zPWjn;o~N~fT8Mlkm+VrTMx7MBQ>#sMo-7h&7*>RcB4O@K2D>`a<`2Jab z?ZW(NBKPr*8(lv8x%{cbsLmu$j(}2BVw5w0Niqg+BOTHaPG@E 
z!_4=!Ih#s#7Vq!T-{gq)Z}m?cC<^;?=XD7uJ3!+=tpp*Ewo{mGkkk96tA0_Ha6YR? zOOE6(b(YDA^d!|wPss(4>1eVgxvEGvO%LAVveZ(9*0`GIg2pAkIAs(&(_#$6tuhQ zLF?}1HCxfeA&Vl)I+nj{Phj&gO68bxq+#VnTaC0KhW`1W=VL459?o*^IWy8(AA{;% z11&pAhCPLV{n2#^(BA`-{>H#*Rk-5!6LF@|$@h759zB(^rd>^1cB=SY-}Ye~th@)} zMD?b_1ej&P(P`}$9S@`uLlpjFjl zC^YzN)ZpW+6w)Qv=uGpOqx{Xc>JCmsrx56&0dR~AoL-}OGO{a}y)L}t^8pSEFI-bH z=3-CY^>9d$cCCVsEChVsGVLbeIC~ScEo)$IWqFqFhl2~Mign)Ny6^PhpeVv==To4N zc@?JKp7BGHe0B;~l#eAxOFv$3xu3a6{eka=&zv3advSSb+`FhU&_f!Uxmt$0!htA)x1A3iN(;f1gsiOUQPwU9;nG_>dG-6kW6rTZ=2BUxs!dAl=D$jP@S^Vx!#Q4#lnK8>y5>OM{+_1G3QID;pej-1qvcPd{>I|_Fz zP$qB(#}w(@H#mP&oL^LkZl1^CuHJ6M+Bx_yc+oLFE44$I3bmMWnDYB6Q@GnJJ;!Pp zx^M@l&CpxERg-rdM5sO?Et(dk=WOYBVyVlI;nY|F)uYxM7rg$0`ze>@Car49IQMvy zlAlMnAt|V;lx7llvq3R1vZSQA?MZl`0;cQdI+;ebgsNmVyoc_Zvx0&e7e2i-46?(; z&oVv5DQV#Ki6O59^7rRN9r$y!eLrr`R29!luEw@_eZ?7R$m`GYz~?}}8Z!dU>!)>u zPNJtEBAvfqCQjN>7-C3y-y4YrdLu7Dzm9-@H6U*s@y77A{)LHLun! zngG?Yx>K~MH>ty$=pq*9FH$*;!vC$uxmoj(G#}k=3ha2$W?O<%)p8A1u#ynlWC`TK z;Kh92XA!wezA0v+I77WRzQ6T*!*KGV2|9``iclq#hixVLhW9%U={1i@`$r_3)>Y@8 zUQsOqp1yz*IG6smAv}akB}Ln+Dile_bw*QI+OvBUf_Ln~Zv)qS5hupiS{hSb{romO zd6Z|A%c87Do>TllB!x&c%0O}C^)%=Ye?aj!rUdxK^w22FoY;dVttD|3wCad%g}PEp z``@%xIEnb)8ihGp>;tu_>t2M;!4c|uOwldOL3la$RbTG7Vd0rt`DqWpU z&o4#d;6;(Xfbl2Duf+p=MBQ_FKqw&H zK-p+TcpSG?D)ErC3A(>~HE1qnpt|Jc1Dn)G#ATCygR(13dCo(g|6SV@4T2Jr$eoAE z{V4BjupL!DYj947Od}RShw^`2&V%MBFQrhY!RrTWwoQ=|SzB9$bmkJ8qO=WqY$u0C z!NLD%{0*NG92b;oV=0-WA;G+@dv@qWn_Wvd^d*R;2d^M%difn`s|~qsP>wU6dP_m( z_t!bl+xYbM@%+}^NFzp2H=~%k=`XQP4A$n;=d(Iw*1)>&YY*Plg*`eu-+JK^G&cA(uwr5qdvkd1VTq1}I2X&-MpaN~SakeKP zQH#) z({=MEm2%G5=%*jjfTpw!GHVsWV5X1R{;8zttR4(x3{s?dK`((PJqVjK8Z058^-7}i`-QTH7ySwYrU#ol;b!>Rz*wcLQum_>S_;JGX-fG z$a&AGqfyIUGdH!~-=wS1Ht{XI;tcXo*bND2v@3sK(EBLu?15SqGqd%eD>onvepQ-p; zx1$5T%UXWVhV=Y9`teG`4X-B#J1)#}?W8SpiRP50`9q7!YG1Ahy z+N6OmMc>DWa^J=9LxE!AhAfg@-b`)%6} zKD8f+$Z2}n1^S>6G=s``xgiVvhES-+w9WW225G{liRKnH-vhf~tILomLy`WkYq`JT zq)xiiBoS1F^z>t_>$6F)x(U!!Q+e61J&9IFixLBB2kDjKR;Bn6p`(gblU+~n$lSO0 
zRB8~|sg@ohxJi=?+QciMe5?vBt^ShDy+EbHp3m_i5rlAx`Tw3hwM1#)-a76aDP7?T>NnuG(`) z32@z`{JGKT@Frh|8KTm$qYe=?I7X6!Wc=cC_du9Y_Kpd2b%r>vS(4+Su5a?aq_nk+ zlY~r$IP$;$m~%CH-i2uy%$Q}+Puu#=Cedhy_}DbI8Ls=_z`jt#Ghk`-^)m+qW)C6ux(h`zb}(|^M9splTTnyD^Y5vv^W#jgS??7euS2T z<*y38ppMFdIk5#BwZ3?~gS-A|^jG@FD5+`fN#0tW@k#YnJ|`FNUSkb#N?9HjLDsVj z^tUm3{4*lQ@VkH%p3InUgv_Xw+~7zvt@N9~v7S0kfuyAh;-dwKU+mS3A@f)Vn)brj ztZ{j=QNzMYQ6^#`i}H$Kfrek6H5E2C2F694zi0;P!}iSOiH^=v)qJ!M-Unp|e{;7y z!t@^2^b1T%iSX)GLK#X6r@HMIqoHLy8l9BJM;TVKw@%G^F?1gog3BPwig|t$s9Z^> z?$ImP%3K$or5Di6Bqv%%N>X3zSi*mUMVHc7s^hx?wb`u{k5U1r2kORcm(Xwmv`|Ui z({x-=a!MiDM#+LA#?X5;%>7p@M00|&ifbDU$>8BsE!f~i0(1$5sP+{Od<9S+?utNP z9A6PjlXBJi%}lm?J@TCVNCYAjsw@?7{xrPidvMb~vO(#^$$te`6&82BJ57>Uv^3aN z#`)9C6>~GqJwqY=wHgv2K1SBp`C@nEJef(wW|N9Kq{|MqyYsx8e)O_QcZD4pRF+&w z;GKX*WSjdXxLBXG%K zw)n~2B%@wt5$JEDTkS@?k{vVKKGH!iED^K~1`d^%lnrYW)DXxGU4kW?=j~!+d^EaT z`=BnYo~|d9KLmSeHhD1-#Ah`YMZeuR9M2zaELx75ceKV=oGy<_g4TG!p5GuqddIn$ zf$sDNUnSGlx+n<)r z$Z3eryBeI$(mD`6>rXgV>cf3ppNyXQmMsSN2gyWyXOJ=k&6>)GmrhVr{24Sg4)t$+ z+oG6ym04zH5HtPYKpfagHWZyKc_=yvQ@Q5PYWg^!jnsO(Gb=76yvKn7#d{^ChCYd# zY!WxAK*L$aNqx3qUvfHx-3RN76Ni>`AQU7$1smt$K&j-=>MERDD+ZLqK>AOQ1S6r^ zB#3ESID=|hV(IeR&U_MN5Z+VJHO*_ z+#yDKFJAZJJ*;w@#iILDrgOWU-c6K($`dFsG-Sb zR^7F0oCIy$Kw?}(IX}Ko($NKiqh-10R-U|Vx%`^Br-HruPscO)z}6sYur_#N9rZxD zu~z(^Us|KGQfN)*7HMM|fk6e8Tu2RzyE&QJKrlT>Ei5zdYnMXuru79Hljc^~hSR8CzzNHdv;0FMM(gcS2koSQ`x!<7%~ ziZD~EqYb5qqX*)vbOPbPOK|H;G=OWSO_SpE|IJX+sKLpMDJDT(c5w5g=s4b-@P9o} zsqbk@0kj9g_&bTU)C1>IxN|0)_bQ-0?Zw{CxsgqF!S=j&wJjDTHVU;D>-vL*CyuK+ zoN>a765!rZ9R#LC0y-*bjIq}`EY=cuAFPSGeak~-$3~N6H;`fs-i&1!w|5ZFv2@BHaK1njIPn0Y_@w%}~HH1!l zTLW~_dr7_gh=-eDl+HUR)n4ABvp|Voeg^>vmK8gbW$#_r)WY0;63ad^kp|Tq??R~v zMj<_0dLG?@#1@21C7fTgmgD9#E=AGJaAyr?P%ec#yQZE~$3Vq3=qVIH(i4q8;wH%9 zuH&+L;sV&0o84)A$`Bim580DAGzBVbRCcNXs;0Db6K{HV1hi5yZKHX=IHwe=RP%61 z8LLuCT4^x?GFgKc?~lASvGDdnoCgD?#s-F-w zkG(hUUHBv~i<`QdIcXAkLJrcqHqawzxk60?CzKY;P-6#>A=vKa2cbB`@pi>E(Xp$ zWW|N%2cO@fI*g^1aqeYI$w9(6z~vSfQu)&8`xh8OZ&_T@b2=tWE}+g>hIvYfbFlK| 
z_qoW%)S>bKO3T(ToJmICN9E=c%p;tOUHk2uSCPnJD~r4OB~IKEH>hgLFuxk7oVCuW zchEyB5hHy}Rj2^CyB0m|3gd2YYKF6~!TA%Tl4s2HQt@_=6z74#DJnTh#Q9Jr(&EUb zGBR&6tiMM#Ra2l?e4ng+XAiV0M1N+3G^5_zo0bl4)Q5R3#-<=c$dSrCeClw0mW}%1 z#K-8v57HRbn}_5r(u%x(t1r%g@+4H*SZ^%BB&Heg2|4r+K}F-mO<|T|>L4{P@0-WO zs%JVHY4za~dd3uYPMpSrh*4}6%FKLJi3mUT@wwqZPvgvq1}QS>Jt^2mj65%J$7yxgB169Rxha-a$P{r)+DN!%_J|0aaIYUNecmo>TZxqDKr)1^Z8+P|qNUGP)gN zJv`tP{N3AzA)g{hp5WWi=SKEKtRm=|%C#M!QwiRtJqOBke}dn~sjAti;KNd&7>oCOA3}U*=c$h^)czdm(vN3DPY&+W z^&NDE8uzG)IeuyFzkZwd@%>$n_T5q`Vhp5BtClz3n=bp)_}~Dk3W=_)p2wwvzbm_t zJJ*>HJlyWycN5Wac%RBCM)Git(EH@UIC}_~wYY+D8F@CQ_!$ zTiq&ZrcV4@lB|*$G;$Ng!xW)59@QRV{$kek9tj5CZO7dA0Oey9tC5=eHMCRI>9}r$ zekE3spqHd+-y+gqBHPEf+-S@lGv2d}$3v*H40Z0->AEnH*mKtHJ|P{0E|}}9pq_8n zJ*Z_x(j7%_w*H#m0NvU*eh$?ZgHfUNDQJMQPi)Gd8W6crQ)cmOJ&>Lj zi@Zx3-O@C&?fKoWAr3HmRYl#{RWSK=`VM(l;+pjPR#EULj+k^x-8w!&kv;@~j%MGn zv_GbgP(Db+}N=F7;$n7 z-%#&WqjMW!kh)4r7hWKJP=$G%mQ~R>p_?QQZtrM|rX?ZGaY_|*6YM+2ExXkn>EzV` zfYNPg+cI6KQj~$l`)YlKTqoz(pDu5MFo9*D{n%N9cU_J&39MpcVHs%H0;4)r{k*$w-eJGhleeV*`^=)_c1A&HViamz@Smj0j^R#+T_H#K+lC`-*SaVKOj@KL@ zV@5Szn-;5Fd)cDz#NcJu=ctZ7MuLNKiC1+$i$IpKwls31;%w9VboZWBjp{f|k=1@R z)dFNkig{KUI6k@>tdKeRZuEy8h8OqoB47SuNO*8B)67x8_s6M$-4QzhqCkJZAg0av z5_|7WotY1p_g>VFPzIb4o2A|5O4$2k+shTec)EOUJ@h2Nk>Om<77-KK=KNFz!#4l? z=jT8FdH?iZr~zs6{@|Y=Nd54~>ZA9$=9{?O{~k5r1SHgbE@;`Pu(vStR-%oXaAlnP ztQq{1ze`q*kf0mE?^wL_^QYbfXBYdm%NzAnB?Odg_5l|#cXxE6L)~}KTn(^Syh^P_ z6tXH>)ZhN&(*9cKt*SY4Q(^MOStUp@eoeyU0qNHn(s;?Qw2D4FZFS5Bx;?*|o~BM5 z`c+Mn;!rZOjK{fGl865;)YOV5Wv@P)EOm%%S_kL8G>(ae1Vvkw_r>-;GSRF5SM%ai z$LWIeC&dYCH@7<)%YDi)-fzyqqN5UW5;|D<)u*lzC<-8S?lZ2^RQs9nLy)qr#xq*a z4BB%C75e0gPX9mxBca~#L7~1}{X@&XcTPR2(i_KfRFz~ZE-fP;{v-`! 
zn@A6FiZb`5^Yn1|SUZC2>nhAyC&uS~kUwFq+X+f+U5ZXFqf8I~K%_c(6|Ld|#lUDs z(>cP*0elGSUC&9q3^7oNri}BDcoNk3QP#v_VxB-O(9C#TB7dnJji z5zwzQpa}TeAfY=^$_zcQJgPn&f9a1ddlXUn0m&XnHkPI598^Z38(ud>d-5j~lNt0i zvJwtiUZTqzlqSW|$>)uU`h?Nq0I0BGdhiO+b%V0Ww_y}H&^k>KiQ*ZeJgD&R@dmuY+asX&hhC!!1VW?L9v$G2O)N>7(?l|m++GeIkfEWuYEth=JTT!ibuI{ z9Xs~SkwV7qjYbH6Bb(PD&hO^$U-S24{M-F$6)A$fP<7`e5bx-nW8bG?-pa#L}bSQHMSEo?x};S3G;MMq?3LJ^dh{REQ{}hFT3QS z-qNT>o#On()fINB=|cM{JkkwaJ^pu{_BZHF>ieHR*?EeWlR6$G+|_ztmtXT|Uh8IQ z&5}+1KDDn$tZoNJE%*13Q7wrn+a3eN?84Vw=JoyJmth{#7Y*p|F?%>=&5Uy{s=91a zPOb%k)zEKVgo(Ac!_N52i9M^>wI~-s`CJj;eV~#E{xMWm0rYNKBiqqm`XHGkV6lGc z_JKYoPQDM+&CTXj7VS%|9hpU|^%b>=5tL&TTM1GFa!gSR^zgGk1y$7*-5bQD;0M9j zPRE)gHBXmCNS9AxB9}%;x`^h|r^nyZYwxDd?sK}0w~b7X&h(jf0}~QBG?X#*!sjB6 zJw$P&k_Jg3q+GPezJ z;pphC-r3&Tmm5!M*coRDJ(4Xy^L^ueRVpi_8rSEl%vb^5-Sd3^i&rM03qo=1GVM#6_R(I9sisY+Bc~bA9 zl2jQd$WiS`f|2X)%q@C7#@}j%?gE8wD3$XGs9*oD6QDO~-sNqSe_ujO?|5II+W{&w zGW;EtoP`YEU=Yd-XxrL1snsFOt1+n}VQLemGZkG;K6V1zXULy6;o3{nlMDV>cdg*j!u2{ezV6C6DK!Nf{-Xi^nBry znvM&ZT?NkN#(Z|h-|T+9bUVnqG;h+}C)DI}e&;IjRcL#mAws7Toc?HWn0??5D1kgq zK$$1Vy?tdUN~Dke$ZK$dt)WwbQ+J`mqy}d?hq?ULqAwY6jwcBdOqhnO@XtxzHUe7z{0EIw$zXFR6)HX$!-r4(|I83dz zc(GXr3QvGm32^pd_RW$o2d41X9m>hTj1Ic>Wt?kqbbee0MxTpF4aq_O`Oq0HRGHQ! 
zJTPJO&2LVk{9HF^0#o&&G+GZcJ3Qe8VOW2h1_wcS%UX-9%EKn2)dS40 zW0>E+&C};ccmKQ%pSOi7CTEM;zmevUSsa!|Y0}+VpM`d;3(_W6rzS@CqCNN9ACkIL zKU_;f;-rSgf!FBNvb+U8@~efCIEs)?H${xU`*O=O%wdW;tXEy=j+8As^6W-}u_Tq=k5v!JL?l)K)MMyAY>hK91RNnH(6`uXq@CGf#*4IsX`>Hy{kJcK6b z*zP!Bo4Hd2q-DGetB$`$|JA*9xUQ(kBOa+eQQfhJ+owM0{s1Uw%s};91(b4R{Jo5n zMdG^Y{cW7?Ijd0xe;pkwG(UOA@4y7f6K2i`=mV^vK~UZRrv*}~oL#Ky#3ET>2h%N@ ziFp#rDFKn!cUCtw+jNiqa~hZ&o3%cqe_-yFauLiD3LkWFgK`2p%>UrVuK~K(k#~AS z)$#0sc8jRpJ2^yxpomlCUMi#_mySJq17Da<9!r_v_|0HJ7$aF-oYU7CB46s8GROnx zG%mj1-vW7HQe{mlI^reNht9T3HmFQf5l~jv0KLBKmF&q7od5-kA`iM=qyVay*tut% z*)!6w8R>6=)aRN{>eIXQ4>KSM4SJ>@^lcn>kcxDCxZCS2m;D8r-&#^va|`QP|>`GaV;#vX)5lN>XxxW1yV z080W)1bi6b>cwWlkAlG$cES}r&;xek9nP#LvgHjtvH zyGbRZ_%45BLOfE3bXn@yJGim$BekbY4SYRHyyr=}KxfoQyxKvC@DksePY@(3w{C-49MW0TSp zc|Iyqixk8rkS`q+mxR?Vf3@=&3C1tWIQc!lofg1YrrEH4SI)FZQt{|P7sGqo6QD}; zaj3llsV%vu2kuXcJ!&KdnFeLopjmT@J!2x5@;N|AhVHF<>>nDYY45eRmUcxGGnj=e zw%ou}VbwJ~GL&KNTuI}2U-{s+g{Si1^`h6MD52AQGAy;G6VyWVpf%Ia258NbG6>B4 z7lU9qg~^|6z_=t(a4dlQUUWPTK6={lLRI*A#@)BbuL9Gk} z_FA&rJ!%m|oVeXPqNyywJS~U?LFjHXY*TaIp_AZ&IsyHGwaBbF`(rrnvIghgAj7mg zEm~-aU8bj9Uy{L~ikEN44Vu)$M5NPvz@Bl^I=}^^ZVt3)PR^s-lSGC9Y9%>bmD#m( zO$lpIDd+YK%RV|zyzyAcL7;xu()^m_%inXWswS%U6i^;tN+tq-`VaIDm7@+K*RD;N zy5ku#(Uy?-b_wUVx4loRg5iR8AYptVlC7BSJup>`+4@fQL$A<~)iQe~LA?p0oCS$nS)F&KXtQ7Ur%29yXmBgze8k&V$=hm56;h~5FA;qa>WXMczYj-HD`YOAiUMou z7O%D3qGAKHYsx6U8&X@R#_-Sgr%(6Ur|IXoefX}WRl}Q7FQFN{_SmijQ==WN-w;~kQQ0~tku7O6if!Xy zow$d_?c`U6IMuk{=X6!7@4|kq*aeL!=E)Ce2V?m)_;fn~UIo$hY{&BOnFzo{70`2x z6!l*||4TXouj<`uYczNL$sL7e4G=1okJT#2`!WBZaiuDY<+u1;onAx@7j=6ufT0C@ zeGAc?_sO_E(ti~&mWoceS2v_z)jDI|S=qiH6yv8zVOp@Ko2xMWMtu*3QRuN!3~hFn zy}Cx?x1~Sz2%xs+RC9KRr=>E^!tQiVKV3o@m)s0Qn;yD|R*39wvS50FQz}}rSRg^w z`Eo0JxSD$OyODps0YM0UA8B}?x>02n5~%li~* zsL-L%6uF39&ZPGUC*4*jejlpQ^BZUxs#Gw80V+B&qnq?ITDpXTOIUjn%=)HW%9JsTc_U_ML z|Bsc*Qu+1O-E+JA3D**}xAqRbrVJ=Q z^Szi;rUFcFQp_&tg^9=*+)be?XCZpJ?4Iaek~C7SS|5-)Ud~fGi%swpR>XN!F2QuD 
z23d(3H_YDkv!MQ$B8J}soCf~|kQIFdCv}b9yfu5Y5t)r|s$-!Sal|V<}8ABUhP2*t2^G!!}N9F%-JA|X`$oEXrK@m?XH42wtqD(T7 zwqPuf^t<#Ampe3Qk5cC~^tcC%_OE-iPf}1DHF}Fs4|*}(=jSl_Up{Mgms9%Cd@To( zJceCaFv?-V*z~+u0h1s0@C$PS0}0@U&@X=wtD99u8;SqrLs99oCeh6>MPufMQT(QD zYF6F)%QE&290`X9kXi+h6@x(=jFw?+o0k>f$>*DzH3Hk;Owce(}a-nl$ZZH#|$-uS@Z`Ct`k0bvixx5(U6Yozuxg@qPxoK=Se|AY)XQ=o*2Bscb$L?%_4``Bsl84VjEykcMA%s=2jI^rF>OI&2n zcY&d9kqWq(i@as7(%bj5dtke6L^#>OVnERx{szXR8l+>K=Qo>YnWKYMS*Y`oyY~Y= zIFgb-Byk!%3f@uiuGj#L7(pBE_v-*j!!eQykH43XuZ|TdI&z_Wd0(H?c zhp>6jWhq1bZT>$0rBC?w^7r}W`LF-``T6<&>VLTZeR<7j5W7^KsXb!xNo7MBR+eQ# zM3ibvMWFjdu>3l>93FN5Mu~2a81^LsCw`=SuHyIle57n>2lq(60rLkizkfMQsI+37 zp?SLB|NOb%@6WII*Za%ee|ddYgg?XnnqpB{lQRH1AJAMVk*6<3%4RLxLZ`Pd+TcdZX4GmLbtPUkSRt4we zLl)z-81^q4c8~LXTpp5%*Ch|=YeOsIqmX{bf{1%PSSBfjl2~{hQelN%31^0ZzK+S~ zksfx{2WcKDF!$MWqP8ILLBkN?Wln$e)-RkV$e6}2tuHTyH&$C7;u9jjIJkcgEG7?u#1}i}&2mdXj z#LgGl>1STYFJ`1)GScsFbEnULJw4$PXP+DUISY=`dpwh@WlhRB!x)?5HABPLFf7pt zk@EZLcKb=#Fidpa;gP5(K3kt|-iIl_#%t8zEKTtmN;c4CwT2spY3f-;0TsclW9a%SHL?#721{v1P- zWU(gaJqjlYv)e_SU(GP=JHlh1Sk?a}@#N;SGE76mDXnKdVQ3<8#gJC`c+c#6I>Xzkof%%D3q_vMp;5i%nVU@tcW&WdEm-uY8Ct8oS^XVrhCtX6 z@R{X#>spc-FKy^*_&ti_#JYg|4iI|TpluTr)wUP`e8KpeFxOLI0oTp@bPI9&(>Ut) z1BzG~^Pp}g4~if1+gHr7d8fXlv(d3Gc9weB4?3 z6z8m^H1~h_?`HZN8iFQz8^jFyK`-qYQS6?S?&_gf*u+0K<+MZ~;d>z!?ot|;Q=(Md;} zK!rBIx5=oNr|H&rxm;1TM6smyy7Unwff>2EF2YPt(X{l`Uw%@c0e+|}pt}m=KCB&x zGhW#~jQSkH{90DOQD{dT9KyPRE_?JZ+38KaeZ1Okdh$VK=1fx}xjhZNIRQ4P(wPC> z0INYclTWfyoNPT&94AD;-XKp_12i_Lr9)Gv^|>l}vP|lm0aZ=c zE>22^4#2OU1C&4C-#_@xkB{}<`{DjR`W(F*M&rD}{N5VeUEhZ8b{p=)^K+Jl#*?JV zVU+CbAQj!CtKWCYH7a7tTyRbpjvsAwKx5=M8;zNNKkk{dFg>%BlF+&$q5iZUB$pg` z9cJbV0E-Imd-U}cg`~*RJ=LF}Iq48~MVCeQ$KKJ<;XF4Ih zSEZqBnRpv|6zGo)YQt<_g+UJ+fIgSzpv$8uA+KpyT>=3@tW!?YKfP6Kq#9GV?I@*|qwL0Y3+!I=g=~4{?r6jGgia2Apx})a2&ZYhr;PIJXC{{!*-KGm zUB;#QJ`F?PUOoDYJeAcx#`F8tNDh@Zd}EvwG9V{7gKYNUco(&~c%N_XI+v_S+c3l) zPjnEa^GS$7&Be15rQ?Sd)Jy@lV=AzeKwW5PjLMN=L{bD~*f zb9H1Yb_oiI(E3co^7%u<$tHq->QqyM^izLIBG}J61RsCV-b$l0ocwZ9edrP>!=z*X 
z_r#bFc+z$C)!_Ul1v}fi_Cj!=Z9Gh6oZn60yg6%WyDpzM^D(0R0wW(i>ky3(qw^)I zBIEWU&~E_!o{P?FzyGpX^=NN{ZAX(cFRN76_arhAKFW6}k_<|5r0m(&wPLQ5oZ9n9 z#2_B9Fr#EPHmw`y&6UU3<>$VX*l;>hp)DfS-mwEz)Fa$;0GA`^J)LZJq74j zBPi&7|9qXS{C9Kza~nSHPfC9(b2HrsHA94WN?3m=H}})*q*U5hvNwVXAs~U+lsAYP z&Nty&L&EdLE&9EUOV>eJ6r{S*xts>AmdacbnFE?`l|BGU(E!Af9{@T=P0DbrcGOGC zf$5GK>3ZxXZ)4Ub9Qz{7@88B~e=CpRS|39Sy&`_pLCw-ixSlZ98{2^+P+s0a4q#nV zhPhiN{pSjG*uhyz4$l=fmZPK??;+GS-f-UsfM5Fn(Je+D8~5A&<6{;5qJ-JbKkOM) z@9NOzKn;yrIS`$8+&OWiP8ec7JhkKubl-EX%WsW&yA+jj#sQ{An%gvNQPW1LDT9ww8Q#=*_9n3_w_KTwS-ZtU(+~CV0Z}NR7m3?-9~4dajRFv!M_6;+Wq5X1ymsYl~ zYG~9EA(C$SHtqRkFJlp^fHDN7;~aeQ-Ki(1<%l@2yIe8ZD!(y<@@gqjjrbd-2B;41 z(}#NY-NSoc&i?wj+5ojtDqD{di}9fa_J(Vz4AQUrs_8k-Uxq1*O0QP+!onlK_ou%1 znUmt4)cAa9IZJJvl#WQh7o@*iq|@~0@1JRUU*ga{ev+~e;c|AZbG>`#B7 zI$nf=aMAoA>O?z>)DhCHNx!}W(0DaH^&hTxSbRtBUQD;%-EKZgA8f9fJHLm0v?HBG5n>l| zk>R?4^v`IzPIgaEcn)#m>5fckoZjSQr^vsa)pJeBpgSVj>-JQ9d#Z&c{v0KZ{rEjN z_=seBUIk&!4pQNwk9~v^t_i}4QYp6VJQ_jm6vgo(#^0#qJqUKxJPrCeJ8T?-uV%Vi_xeRo! zb#+Q8EQ?WODaNIMw zTz1@f1F2P!>ab`~pU#g%V6Qj(XK1_h52~i-SYr#RHKG5d0;~Tc5BLq$L94n+jqw($ zLR~dIfS{D*vITef{HYO?O`UEkz@{Nd=+_uyzd=;8xb5FerRkPbk&@Y!m&Zxn8J#e{ zG$nG1^gbH%$EPE=fi$P_(b;C$kMmCM&|Qt+1FT^W^s80Bj(?mywRG#GU@v!W ztO^YMAo1|fGglRg&MNp+9a8)9PHk>aQvTKL`1beym*z%Z zK4{8YZioE~C5V_YOS(*z^f0+bozaRF$}rU?x~2&7>*t_Q#=GF!@p935Po8kuqdGbG z@i6@oBgF|$>e#tWTz{dw_M-T9PgxUGY~;{GyvVUIO0!onY+oXzD428ZEJ{iadhRC# z$_v0tf+&vuK{Y$Uzm3c`iP!4JM{@r8=~hLpm+ycGtY{M zuVOwl7gSqea>4}NMk<;jNQ^aMg++V`Z}(7MU-&uRGvH)(LvML?9Br<%cB<=T6Pld? 
zG-RA#zuy5U@Zo?FZKntNt|s@c?(g@%H*n5Q_er7MK(sH~-}+B(`jb(n%%1qj=BCoxR!W(4KpA(DXEFpQ1gjsrypG1iM;~>#~S|3AGwbPcpCtA#!TF7|6|DB#B9*ZXK(fBRT}ZmO}wOv+MwC@vATbr~lqE<|3&L;rMn`Yy5$ zbwUo#S%3OiG0!B)lM5y1B209Y4P#HdkuEO#f7qJWvc%+NhXBlbp>Hl}kXehZipJU*U^}oz`l-^g}^CVlP;6Y{D z^T>fZypkGOg>%|5;An;@r$@{_*Vi4jCyZKyRzvfa0>h&wWYVBVtXTGB9|ovz;rknY zcpG%i;ZrG=oT@sn%g#HA90emGQ$EKDirewy<*53ww>>M~?DjM?SJ%Y(6`-g|w`URP z&u?=u$aHezEBQtRX;XTzOl8Gzb_Iv7Ed!J#kgVDqC`vvDwf>15={#iytqGgLppXqT zV@|L#8D}j3foHFpGD?4p!f@TJS=27Uvu+Was1KdD4wo76wwg6;)fr5%y1fO}nlezk zy3Mdlhcz7uSLg#02APKDa*Qm?JHfd?d$SDE!)(W>u8`eBCw{)tf2LyIIaC&kh$^TJ zE==t+x1dZ&GlB)3jq2V(O|;fDy3XU}1S(}p#MyQX^=pQDr!7k25HlHoj=!k+xjlad z+X5n-iG&()!TDo1=p*f^jfj_9?44(tXVwN%hWrM_<8Qke8Ai1>eI}iI?w6Q z^HRi#HvISg-7othI<1mzI&L><6}{+2`D@*!9I2pzHc^Gt#|Mzn^qX{m@&Yz%sE>jf zkAl7Y0Yg0##`p zKU1*j*76fq&OWecYv$T`*`jz3vB`-q0IljI+{nHz7InP-Bw{CF87Ga6ls;Dzrx=+^ zTvT9;>p;uBi!_uWq^LuxVNo^NL%o3OWUV_=VC&*@+6oARkykAP{c7|CY4^z;wBqj| zt?ne$=+*#<0joK4oM@4t#*!k;8pYo~`!G-XAQpWsMM<;fvrSIq_bDjfisnR`_vTN$ z>%vju)tsZUPKA5gXzi>xc@PZ|Rg$eL!u;`W^zC^N2t}lvyW3CKy&~q@lO|R{)8bfmcnPw`Ygx^3*K?DLeQt22M0NNj6XJg{gu$Q;*ied`M|$A86l& zqf`TwPLDmkoUV2|;-3GX-?GJ9JrM&afqK4@2XH%z_+1rThZO3*E#uomxHa#gL8L`d zd8fB6^Z0yD3?$PI%yb2m>1;Tdq{n{x{IpbgH16~B{j>ku?`KeHrdO((Ea%C#%;lpv z@#fP#p!zwsSx+zaKHjF=+*lNq_B8$Ov_)^PyJ%0LWnTFRG{*ZqHMzG< zVby_@WD%O-J`6+uF*-ay+1XxyBcBDG&r;x|#K9+E(&g->#A}=Xm#T{ozgp zbZcMqsB^LB+JH~tQ1#H0aN-E4?ZT7zBS6U+aGIFQ>gjVK5*!A0u&rV!c~Qc~8R6pM zFNH|=nKkS6pQ8)fYq$NAf81aYxJwK*7AcEhkM)+MKwS{#so14d;nN@7nB-o)J$-Ot zC#xrUwJ0G(`)^uy2Tt*>>yPZ|BR@Pi9l-qNFb|x)ST?|v(QM?StVh$~M9AhaBRGIC zjTw{`Pr4T4;SzI##bdwec(!p11geeh^K;0XV6JOuudb9+!JwKNy4+9w5{90G?Qm+MJ_0VZ~fhb8)2!P^CjFL=8|<-e+YuiBFb(vlcV zh!(14X65g>VE>M6(*A_o#6GHJkMqWGqhq z$@Tc^3ep&B_L-E_G%U^`y8jAN+VI#<7YZe-qu%93)Ac6?a$mqUPP)kGWl(2|>nC~=TN*^UuC)?Jp! 
z6t9;PX9S@t4|Nt6aeg;wXpf4-V>)xa{Pai!ZD~%_ptLVn_FOAX;_D?*tegV;id`HT zrmUUZ%Het@x}%}KE2eNxvG)PupH>s?!bP@VW1S z{N*5_;6g~=teTB#C-aU+^3j;v@Mp@UY-Kvoy7!0|dY=8A*^59^Cyt>?zw%HDF50sc zkN|t0;uV|7HySSMIl}neBBX6)cyQtc^@8e|O#~Zlk-|6vzi_YY| z*(ozXXLpzZB3%)+MQFcOcwtZRBd3v+vYKR?txoQnTA}6%GGpIlN zP=A`i{*e`93h?mixNFm| z8R*ZF-1SrXh}+3nh@4)}M=&#Y+AURxRBe5AQoJpy@56guVeRR8bod;`B)Tg`dKEOP znFt#-ki$;8F}q=TSvUK3L}F|C&6+?5P#nbnrJS)&p;H)c`j1X1`EY{lt?tnBuQApnIXDXb#syY6eHW8!ELfP@zGJv^u9@%>bi9k z(aUw&BbPxq@?k(2#K|R;{>WQJ3>K6V52ObisNVVKkxodWdIf{7H)+o&2@1(h%yEExHMwNm^Y`cEBcecGTMDKEEm^#rQ2I#3jfS6|Xw# z%NMbOM-m=s_vaY*LqLQG~ zSHzhMZJmB*H-Tw^-fW32 z`52@R@ci<`U@mOUl0CELH?i0Kwxc(~49cN)7|XGJ6;uJX`M>>bqy%*AL)#86CX5u_ zanon*Z)v8ygOtJ%I8RHA$=0wAElyt1wa}H5q{V%{?DF@BVf&m*&(pB{D6xD~H63*}D z>CZGMWO3dhb#FfLFj`09kvk%jO`<%@2A|H2Xdvy7*#R)$p?wmMwBh-?XwRLCKBjk+ zcX|Um_fRp%qij2$clLweVUtO6Pl(FNzF8su>_Pkq)K8YwWDG9*=V?@vHU!eBH#5GD z_Vqx6{eD#<>r)t9q3oI7wz+4*^LG*D zSAHu^C_2ouj6Fu2$NS~hoVhs*GebR^p7_$INDb#oCEzn1R9;{Fm$)4ZWzv$B=GGCg{x&w&!;QU9 zp9n}lM~l2_*;DLUV<&4lQj7*$*A#K0t6++S(c|jqBXeyeh67rBzx>cTUQbg} z%mMdbXR3fEMbQMTAx>hSjy%vqMsEzHatt!kmWO%PWo0KkESG`S^gldQ&&gG;mtD%o zt)VhW&o)Jv{t}hL>+~Aq`^2NIfCtD&F*dLl ztR|J~ghMEm&=gJy=ofYG`Ts(+xu7;euQmgWYy8dhfiaC?7~FL0$MK%c4yc-~%yyKb*$l1w?jD_oSb2oL@5~%e?O|+Tq9#H5s2MO^CpwGjz8bYte zXo>Dw=bYJBBmQ};S;zpiPjXV%=g!JI+KNqZ67e53Y}7HW5E(t@oP>{54C~5Hk$Tq` zAMcAu_xva;M}L~pTu=6L1@U$1nS2w}#OL{z;-PA=($D@EvR!OSJ)jV!qlXR5|Cec@ zs2t(1RGcb^%CfZ7<kNu)K+FlGt`liI@4>bnHJ1d&gW%MU_!L@EwB-nVfGqj6C$ir2cgtu$G@W? 
zyaHvq%+M1K0@5XzR5Zb%`5QvvP-6jU=I!Ih zMQ2KZ!1b;oB&5CHpRL6I`QkxcoA@Pb0}pixqem%{_j)@7zx0I0_hGQ_!;qIE3b9E< ziC%03QoWPRlqpCZi{oXGxz7wtK$b|h5UJSB15PDeqLYiuQ&e{-?O|amE!K)(Z!jn+^)ySWPu(=FpB2%H@vo^kHrG(wB0}O zj}Vt=0*dOmH?CBCe@aZ&T9A$(r!Fl)3hI0+S@rwtk?g6QknU()FFT;)*%PP=qmXC;r@xqzWgqu4=u1y{ zj8u!sbFMg<7li~S70HVzecbfn^P~4ky)S%!!=_^XNR|h6>!@$;lBPGur^9{nFX;95 z80dEd`rC|-wGDT3-5H;5{G8gs$)h?OJ?3PSXG5|q(L?e2`r!3t2K_|ZWKt4uRq{s^vTSk+@X>< zt7&nZet*tT{(d!YANT(4{dODr`{2ZN_@& z78R`dyEk>Y`8-0m5Z*<7vLJ12%UxWlXtRuon01Ock4M6NMnCziWE7k8RKo$5$w{tD z&v6=>Y-(i{PQMj3X$gkX@#WO#tO9qYcGhUi8b4euk$%CZU!*G+N!`cDMm$nUXPP=>{cr6-<3_2!#Bb6bQO*kL0tUQ2=h!{>LH8SviTibJ5S338Gg=+`08RRBxBq8TQ9ZuAdc{f<3v zgz5GwWb`y99CbZ*UC*v1rZ!_#_XyH*ZV*+}@}0sm*|%TQr^~5}eTrqTK5iMp&=;jg*dRtHH7Vdp(Qcs5{;3VKLS?A&Nt`-!^%6H}<9|0k32*2pm#20$1fPgX zpwTG<<<|;RBrr8yOZy;ynLXE&6lF)^LBEYRs^yL!nanziH1i{L7JPe#4q4&|rN&OlA98KCDH|2&~ z9*=d@jV)DsfU|WnP`o8LzcY`tEboJj|Y%t5g0`{!iEzaW9x z48z~?%|6Ejxr-)EKBqle`x6)@Qt6<-U9nq}!|Wh2h~DAzjTvAjfG~84$FP=8k)z4r z{(34ukF*%vMCj0R2S5?kqbaivXi&o#ZI}}=cT0^#*%PN^^VDoy6%Re`&8e%Aq?+c-v5yobA*s`9Zq7T$2+8wiKHrBK-4wZSql22?H)*%h};p6A}vi0B+H+?FNzpB7l9sx zdc)#Lk>XwTQ9HVVb`K{s25C674ks}Y8X94Hk6EZ%J;5dT7VAO8fl|9N&aY+|`~F4*FI^;} z+&tZ0wPzs6^}!xp_ekv=OO^X1eD<4{W?L$d?rhJ2MOA()3oG7c(gjN?kY0 zAC_?bvZA=$V%{@pEm;MgzkyN=U?5s|scy5tggUZ|KsP2$SH7qHdc8CX4Wxl42HHV$ zwrQu6OB7yFM16dTP!evD6Pu1xsQ~3}Uhg(xjdvXTIqY(382Q zd82N|SSuUe(A_E!^6_qQr_OOI!K?A1%lahNm*;$E`8Xe^%E$CfsJNdt!-;Bsx-m>Y?onDeO7PSNLgHYRhH zWqQc$dg3g2A4RUzRnw!P={cnJ!IO_ngyyey2@ga^Y)@TN2FmL90y-6HtAM|C)Kxc=)yf*j` zx`(`jx=m0S(L>IUT`_t$H{~i7coSK%#3b%Xn{b5|VHQfcKHcBup_erJ5^0gu?Q=Q^ zv>63e)*{-kCiLQ6krb>fnV_2p8L8g@hSf?66|)TN_l{k$ z!ghMY4O+KRv%cItLXfQn5(6sui86sv>&&R%(WqT#|7Zg)_g94^a#H^s{5JEaavX0y zlltd--T+f1Puwc>9pI$F+fJO~oEM*8pr58@=&nkziyF10zfuwP8%DO^Y+Bp_zb7uK z6QCF>Ik=>jsn(H{oc58PaFoR()#KgXYCCAb4ph=nhV+SAGfNgdm>U@8v-M;!@Z%nM zdJ6PUwk2-YWQ;Y+Hu3uu9c3k`-}l<=V_E0h$Jx5(3?}IyFP#ue7X+VokT-W9$DHXg!@1M>7W#6*&I2@_bYlITVu=bFCXGn<(8?E=D 
z|DITB@A&#I$I(cWcWh)2PhGp?N%vH`mLjBO)jcS7ksx=`q^L*OiFg%G-XF4)Us*KE zRHY7(irMRzk@~fE^Tkv*dRfKx7+Reb8~~R+b8eGDwSl1S4CICeYWcYM)l6a&M%PWR zQyi_Kd}s!x*nvus_OWJGg>{X4Aw>o~sN|4#RhyK{erzxxMDoeZ~aRb=p;iP=n}$~gDZQXRQTFYaa5x|{{O z=F}-VQn~ZuyeD?dJSls7vc#}~a#YEnKGH7{`EN~Hc2F}544PsTYG}MQ+qvv^3T!}| zY-xiu!Q{8WYosNmV*iUbtjfl)vd-5|?c7dV3DI?za+8YKV^(9*=jh^xk=_`Wyy@7a zWeL%ZB5L|7L|R_L?7p&e=i^EZe21t*Mmv77grK16%#^`Ft|8(|Hlp1fS1NwV(P zb4LS>0p%qVsLD}GOv#*dH(8suii)P-)hprrX}ZXrx+r2y zOAp6G7ZfA-2H7u|3i(r6inQx&8`^g&ke+({afgq(jC5|_=|J(V(ZsdB?0BR}bmH2; z{8h?;!TuFq^)4c<0b21`Ld{10_~SV7M?KQ!<4fNMlkPwVWE73`X z`uRDB#ZHB1M^kV;_#7z&2H%2oQ^}ye4Fb~Ol_nA2NzU*5GQInJoCJpPz*{9Gx2ed} z62JCEoK>Qd*8;v>Rc3ZSajF%FGzU_k0#0;Ab@yeZA&?M$>Q8{yJd`Xvl`RUvJTf*MrU!>u$XpJ6mUIm_?T z8hJULtDo{6kl&_(O1S_O$A6fh;beyCP}}1XUKkY3&Otu^9I$PI1GOws5KE?VMP-bsCv_X?_|9!gUa-Ak6uR7oe z$z(;D&*uy=P#L9kuV_uojB9@>5985i*pzq+DfFt;i+3Hk(PV>)K1oe<5E4Jf_;rQpxu`7Pr3@*wTSG^51=87Vj>$KC>}Ra2T{mb) zLA1Z0e0u77e(Dpl2y{gu(Jx>mPCKSbkp48T4?MZHPbFCdVpELgTJKD|E!H_RRUtAd zV@+LD{3tO{2oS^SM_AT!&2zT=oG}6t=rrtJHYzBW%&4X)O|d9cP<`ScHz@~E@d?sj zl2nhDn%gwVOo-$_HD#P!F9~hYi&vX_Q;|OF?zQslC^cJ-H8-i*V$BWe?|eI;A8H)HzcH$D zxG!>9M{8dZ=VxM*3?^gFhe+aEXyYoO{GFG6>f@C(?5+M~uF@SzX@-?Q>(X>~m@O(o zo&Dzt3+w}R(_E~X`%*sqsC58(K!v~LtSkckY6fpkklZiCaP-B7AG+&{zl`T{am3Z4 zoaq2ci9rpwgc6(#RzRo`Q#$bi-5mc1-=Q@1ZY#`fNPhMcQU){}#irO(sA2KvyM2W4 zl`Z~gYTJBmMsTJ5jcyn&rX8HrdAkbS(Gqrl% z`6@Sdf%8OmVA{1uNOhE$uh;~QRWz>^Ex6l2^T@1OE}nLP)Vk0(QG;}x#*;gB4{6o( zUM8NyF4lDQZPco$Kq}u2RZONp5+;jv$qISoQ0YZKA3CPIBTzl}SvE;iTe`4neL((E z9=I|w$olZP)m83AJfRUisKbfUN!vu9T&=={$=xE(r7k%{=2aiK3hGA+ffhJW@(#o@ zndP1&9oqmN>mo`Ag*w`*3s&khCe98h|0P&QXuaKTR@_>3zulA_L%Za7%4uP;!@U8^g@rm?f4+8<|5l%ZZ(Ym*ERNc9>B;8jxQNWT}P z-z!q`qxYqM|4C2d$A>>j_6bDMqi$`^(GypH)~eZ=Hk>=U4c<;$sHch?D~nhy@q*V_ zb!WYkXXy>2Oic#Z1^V*d^gahqBI{mrM}-{K~&k^j__$ldMXFe#sI@Jf7X&-(Mh*+%0v<{=E+beb7w)9Qs^SrIIX` z-gkT9e|5aWBO$*LL@6Vyfaca?ooF^Df)6*Na5){e_?!d<>N;-_)F}bQ^rkUdcOh{a z@OOMnvqOA7^IxwPsldmnsN(Ol6df1@Q>fM^*FK&n_Mv`rs5AjTE_L&fXhr(z!yN6? 
zoCjsEUwTK`n{8E{AARE5FWA&asK)I#pd26lN@^K9xWW2JHNByP*Z_m0HxArR{afNLxbz#CF_r{=)&5`^tv)NX(S|z#*H?!ki?!~qFV|Fv1W!|*YI__leqWg`&!~m2MhcLjv|8~@ zSyiXnNP+Qv%Ai8X>J4IGRZy*El8-e|E3~WnFg2lwos~bS8Ihm-Fd$t(`O|n)V#2(O z!QbBLzn-I#=!S0nKB3i5HffSf6CGA1(B04L0{V4B(pjF8JGr{?yIP)UzL3nOkmXt_@xfA*0^?=QvC(4?W}*GdvDafCpux!9|7Of zB;76LNpY7$zquzF!K&3+PqIl`6T#mMMZm(@2hvaJ=KkZ6{&KOK4qx9s@1JkEtWELw z$AJWYtT5W`GZqprWEU0hQ*nI=t!uDO2?bh>t16c(D6{VA?Z+qXK3++WRbtjYFpO40 zofxegeg00LlRpXXKEHNT&gD{3S=owiv7M^KmXh=ePvpm2$u3qYBxt8@BbyBKI$KxQPM0DrOZhkY|?UT zY9K>wyTJ$u9yR?k+hzxHc zs%D25V3mu&*3R~5D*A%9(e_)U@zy6cJXeEs_rZ6d{fdeOcEK=S#v0b&P&19Qphn|e zN}LSQwIr(Sni{CDk@_0?^xf9B+nrh=8$=@>Aer(=rB6>4@Ai&JRkUgwl@=db{y9AH zF~BHa_oLHdjT%I#kOcLPp9$^-pgOsoFo;xPQeu}gBlRcvul@kNy>$MYcJ9OfZB#yv zQXWLwlTv5yloL!sSv^dl6_NhTJ!)s4>ugh$oobrrhlC8j@e|TU%>INYiW1JVA>^PS zq92o3rPnFQ#aWM7@ma|XsSYeo{{hlqW`F#py;?+>mG{t`(%~sJiJ}tXM)cB=_FfU_ zK{0)p?nHlmH`7aZO>=Q!?h~N#@xfkFK(RcAYWrmzp@8zY>E8eJy*1`;@7CJZ|1x)+ zE_COEy74w<2JK~KTGD}5R_Xk8!lpHU7hZZx=FG3sc{^HG-;aTP~(}GRZzJtH}x~M)aEq3q8lXXyWLlbi^Uq_gzW%-khZlNilNt{w8_jx+LP-Dwg8}7Fg7Rxd z`Gt3>NZ~Qv?rxOeI>wjj?$qdRsvBGTC(H8Z36HsMo~HSUCuJI@%&3lEqf!Sh`Ez>$ z=>zmAb;=yL@G*iWaGADAl`r9!ggzTp&WlL*uBUz)U#BL&+q(nuPH#Hj6l29YahS=$TqM$C{L+OgL6$s9%lgq#1^t8PvlL z`iJz&In<;kVWW6d5$RWRyGfz%LK4gDeV{rDQc{0$??ct?UiECptlp#@gqeKWClkl& z(k`#iJ5{#o*FpJwj59RIhMpeFqcDYxPLIGUzTQ?Y=|{%2hm1Mj8P4C|22bX3Dup}M zlPRw<(jDntz5)0ZAZ?uXsDP@*`^Vk*#LwPunD`OIJZC3yl=79ucY_N+(>!#}29Zus zxa3|nGElWwAU$`f&!f10uP%f>XiP%4Z>tWL-vw^GI;5kXS`p^w+)|iwjL3PT##hxs zMV-^snd0MB&AAbs<4F6E@c-@r{Yrp-H@DFlztT64?sXpfxw!xuB`SGTnwqcGy~)Xt zNu$7v(Ih+2k>@)g>7DS0ACD+yvwk;$v3+4}XCF|e8hjogAEHUWPHNH&m4XKWUEA_w z)yC1;BGT{Xy&qn+*3+{;N*MamjDsjVPF>fH3|l}4sTRKvgkyu#2~_V^b$FJjfO>eO z{C0Q=JNNp4Kj~k9;6WP_v0wIT7a1otJ5_yCM2Ty4rk5O6fSZu@`fG290Zlhpu(vCo z{B>>eF6}}?dIi=qU-hz&v|3OKaR9Y@>J+MM{D19|u9(P0P_`=cZT)>k@4J1>r3^?& z``7jrkG=WpX;&`!((E(@1Uqu8rZ*W>ZdDuGyvtBG+cW(=IzKO2HSScTE4_se5D82l zN6V$+YXVFe){u4i@|DyDug-skP z8%GEZ71KyXn4v(muW!0Mr|AcnDCqqRoCMUAvIjN#elb_ki%>B)cVFjaW9f&S*Func 
zwTd3x6p}xmRQKrf;vO}5>C>T@R%g;1>MH)+zmNU&*TZ}_dYT=;4$#8GRD3^EctLUg z2i+^aMU%h}{6jR|Q$o3E@^SndUMZ}-PB<6eqD2dWEWb;W77su*c6FvUw0Ims{mr4O z=;!gvyuT0Lo4yR@-Awn5>Q&k`vD=D*m*^hksOX?gT~rcDSxaOoXBPb?n7gNbY{Qbj z-JsN&3o};83C&zykYGmWC(}rZhp1A8V7k2cKA47T{bWNvJq8ic>~lHjBP}USJmWq` zwU#A)9Vc;heVjooG|w2q|CrnOU;ZbYNt`i?fVVtqp9>zrC4Ec!I`{G@^7Em=Zna`ww?25fGudZx`rKW>leGX;glI{~TlwnU2?BTJIG%J2lrr==R&t zP@f>FONqNbiTM)3VTwkhj%`XR%niXxNQ7hQ{;ikaq=l5yH*&ImLW>5L6V9t%f?DSoKlzIlS~ksR@z@Y+^P=s;RqA zS0SZ)^ZZ0z?(&$)ef+JD7j(s&ks{K_?IWCQf`)7n=`V!4nPKN4Py7Zit4lOE4(?CZ z;H_)==ZJCK`V0VJi*p>Z&jZ)CBpaM+1>z>?AuPZ`v9B}z*cnMfzHcAdjx9e?#i~yS z5&|emCnHl`N|lPLPfI;Cb%s6ppjdMKky?X?5D{QYV>3P9vD{nrF;ywSB%RJ_7*u2HxjDg|-N<{Gjy&m0dt|^T6BcBSRq_XFVxTgrDFclr zHJlv1gtTh3D{v)=7UcLm_?Nso6&NUOQ$+c*-c=`aXb;Z!L%9i(w;)WFtQ|SQ8tzqp zS=C@|r;t2Z9f${R)b=wz%h8OwS8vK>YQlYGD~@HHLXA#5) zk^U!6D&~AXN(Ga%&ge%wmE5Bso?2g|4?+H*j{4y@qJ3Tt&@{eV=d&p@y^oNXlQM6h zD!=m)?I2!xG1Y4aWqH7tA@xy|-y2m(^N+^DzbOh?JbGoXbI(%q3^C*P-nNnr%Op6R z?&o)&_#AzLb9^6_zwRZuC6G)_Gg&SbCU-bJGoBQ?SDSLEqIz$yEJ_m`XA+p0sC-&9 z=T|enz-~Tr62`cF_Mz*C9x6B(I#;2LW_XKF$5}j`(Xkz^i1fEH&tb?^?_W`J3$;={ zUf?+SgdfFzmqbyDx6bu8OrDpa`lJtRD3d>@K{&YL;8!VLQ#*nTGQHABz5#d~Wl5~j zHf5k+%@Q`b!%X@z(ojQ(7j=VEph@JJv@&@REilVlLi&Z~StOz3Qj?62D3RTL2Wnlp zU&IL{nnKixL*;LMvNa(uEh0TB_Qh$`jjC+jXh*Ji&bBJ&#|!~K29E>KpF^Pj+SlV7 zEc!(7?9^Fyx^}Iad8!@q;h7#$cu(c`tg|UnUP%dDZe1|y*Re%^e}9^%QQoJUKj$;s zR2uMqcsI7+k9OANDdPHT>vKLIrl@1AE>?X?tN$B^kYtLlke-wa6Epu1f@szE1L;bv zcWPkvi*+9fMpAT6m5}~~d#nk(2;tQ%zE8r@g3I~k&(u%y{%(ht{<8Q)tosn1Z#!r% z+T8X0D$qz%gl#GSH_E7&2$bwPBo-bY?MxIT=g|$;Zw4)reV|yshmArBHKuLwVm7X` z(xD7s&UQitQR>|8>#r>5Z-kD!*Iwr#i=gMU(jI7h;C$$|_P1<}zOW@JWK4NEp91#0 zg*t&sSrujzYM_Rf9lbvCjDe6~?8DFF16tVei4xtu4tmHUIa>)*(KSJXpp5j~p6#?D zpRFCA@;ems(!zfcNe^|$xiX#8(Px!?zysMzxlBRr-snp9@wLT_FW;#=QzfYk1Omr6 ze-~d>pmf*(ts%_E_#B8*1U05=ojg$XZgXe3rtDRJnrWPS=PLyhDn?V{p@PsTblj3= z{|srY`qm&F-8zagnpJve<@U%ay($^q(4J(xp8_4BIYMe=$XTk8!kv;bhDdgSIqLPV zdZIL_qzlNRI*fVq>`EuGm576BMWAs0%TB$Ar!;lG`ZZJ2M3ZaiMNPrH$p}q`)rHev 
zgB6Tk1Es0&$zDBX_N<`w=p$Q(!j5&kl&ke3bTAzm75iJOt33Y9W-A zWPu4()6Q?{WvDA9;(esu3hrS8%_K2b;9g~Sl+>X&r3A8ErP}t< z)Vxkp51qeDHJ*Gr-bAHYEamS~O_~E`n48Aw-A~@|RjjAMpka2>=hP}mR5c#amn~{T zq7s1q?gRZLpv|P!)VrI%e5WV>88&3RX<*rgo{WL`csk^Rv7Tb;$H2~n<$l)goq@+##lJT%Rc zgYkxXI-r~{y`aMFtDxGPbf)O!!AFJluR%%%Jol(qW4?L3UuqAwMWcUa)ugCG=U+Bu zs5|lLnEW(sN$0Duf*#-BsGuMAX>$@vbEu9cCtFtq^{^@B(5#h;Lxz;c44!^uQmRUr zd}XhRd$NplCqv=MwCxbGJkpmyXBs4+4MP9k{uGW0$JrAsCBI!xmJa!I_XVJ_(!DC!&JRQQHY#KC2kMHJ-+3k#kEz1?pehlb=@LrMwv%Dhw~Wq~!7g}&gRT$H)Y=iz&oR{2#h9f6Dhz6u zb6*3sA_&BL1!`#ylb%T45SY_~7n&97FKE{_e+pMWd z>fX>t)JXI0p9Fmbry%SSVn3%Nop-^YKNv|Jrc$zUx$B26EQHpW^FtDtJpOa2{#1#% zxL(E@VCtB!_G>LwA7L$D15wXO)5OQ1lQLz(?biklozR|kiW5xv@BJvI!TRMkA~0nc zQltGiLRu^6)l*_6J5qS$W=+9%I?9e$fSo?3v9q|g#?hK-o}U}7JOEp+{)WacIiR5q z53PMK4(-~-0zeH^%q9&JTkrqS127gH$hsnMHOZ)V!kQ@k-X{{3BG5b#O!+buy}N&0 z-K*hs72Rn*v_^!T94pR5=#6S48C?f`OQu)#ZCN(h+*QqGYuX6vGsM|C*QS zhFMRPt61abosO5?6awZ{FzS1BdllfBn7~4VRzTQ~o7Lr;72Qe&o6_6|&r(o>#4^Sm{u!MC(5<|2jtX;BitLNZS-q&IAfaQ=@%;y}pA|l@cKr3%C3QgKh&Qzc>NFcL zM%VZmxjSszPNPp*GWCH%<9D>F_x+BTDU$azXGwK7npF04kV<;9C|Mt%^f{H&H|;M| zGyLNp)9Wx|SV6Db)J$f~E7mhFq1t^>O{bKY1bKpX?N))bchGZQZTh{S{Y}wAJE3{D zw`bRy=eN<_Uj3oR5t~}-UWv80&=I~*{4lG0?KN4>)obeiw@^*x2TZhWUbmn^TVW?4 z!_wL){ptqN&7C#=mJj{@dXa@51JPwm(+Z@Uvg})t@cu}n!mLsW=y4~&CpCc!h1$@= zgY_u1DM78v%We<}mrkqN*uO}D;2F}NM&0k@)024eIiZ@~KRBz>Wa;Y7#;SBS1b))B z;6nm|I!oVkqpBpp>KhsaESYqltOB!is1P}236J=m!6)9+jbcX$(Vaw@J6pH^9{|c6fI8m+V%M zyKZamA%n|2v*VSNg?bTGG)mTmjH-OU(P#y9KkFKq>&O6dNqV+%BOgS($CM4epa>PT z+IKg+-KU{@BXK7P(ch_)aU7P4ab6oNw_VQjOQ5OudlQ^>eD8Ki6$1mI=n<~~n`hbR zUHcs7ZkrBzx{r*pP!FSRq4omu5^mU}iXr8!`ddQ!)962cPzlt2v%W;Nk8Cp>^o1VN zl?N&)f4rbcEui?!4q1YfvKSE67frf5WNRlAa&B#eG5Kgv;e}|?(2}65^Ou6_MPgFP z_o|FSx@DmLI_RIgTlN1_lkKJ#`}P*3{~TQHc#q1p8P@8VPz8B1w1^7Uax;VN*N_XB zSq0cezfmwX)RRIK3vTl(F`frNu`QD`#qU_vitvzCVbl$*kyQsB7%cN4>#7z;uvg%L z0Lh_AfTkChqiRbT=l1YVOP8G;)n2i!TKYqvnE^_&os^27sRXKb)2P4HKsCX7@BS3& zpa3#ZZ+q|V@9*!%pR2rpRZh2dRO$JtO0O#&qzOwv5xDyG3>sQP?c7ZbxgP1Mp!LT4 
zfiP(mfeM-IQyJ(APa*z-zV@Wn8a^A}U%^{53+PcO2+Cp}wumHZb)pGVS0$i3b)lus z9$H$=CTpn_dsHL?&#8>}M6tp2ozKj|Dnw zlU<;bUC;;mx)R)n_jQhte?gbDcM?*4(frIaQIgvJ8`ELnx^h$4D;)flIj+ z)J34#=y@(E_^dUZe#Eb3n4vcmYd%anT^Hj%(#n`B6{cjqN+iAttyg#|s6gBF;={fL zZL&J6R-CLI#Ycu!CY?8?k7+>3tUo{VBZdy&z^JWBzaqetJDoShLRNgl@OP>OWx3^r zi-ZtFb%?3ErVR7+L}61P$a^1CC*Ed_3KN(CqQ}^gl&Gbq?-9Jq1YoGg@_p5AKg(c}`r3ofGxWTq0Rg^$w zn^c?7!cev-dKvq10CCkY4zVHZyw+Y>wW7rjjZ9t7HB?iqO8D?k)NWm~~kJn*dUrT^0VW}(N z63{V5)BcCyQ_?t?o=B|rYD^Vu3WaxiX|>IgIat z-MN5rfsJk{Dp?Y%dY4>M3DrhRla@iFu7LEXfrgf%s)YGujQ}iXxjbn%PY0vE*7{h5 z&x8rpGJ!&>R1sHw5hzWEYr6H3NH3ji@6oCj9YWVBR;j$Mh`G?TJ_k#Vl;TeT3Ii87 z{S{CT&y%5D3#<=iDXBauTYr^Y{I-kDmPMq$4W_}}+RYqyYBI(2KnG(_WouMfy=85N zRcxvhl6F7=s=uiWX??siZ0WdFds(q+N2QSM)dZ>SgSM8(_WfHxdjIokv<*EL2SN3| z1BlOU=7bJbLj1K;K2~9U0Yb@*S8bcJP50vz^)b^e`2gT0e+sJP0|kW%-UQJlo}8)# zX$OvFQAQfy0_NM41b_WzeKtam5QdnZZ{sb0Y)#{oy}nS&RH60mD5%2o8e9J)Q>6*U z7MzeJn|?RrYe-MVTg2O!*1txrNQ1VJgmqdFSRNv!n2Ic%NDu3v9njI^~Nu5%<-(Ixw|-|W*Wu)w-@ss`r5$Ou3b z4AblLWwko;{sIhZuCv5g$<{te`-u#$LrZwcroYW+`+2)bImc4?yk{5I18fkvNGU^_ zL@7Qj2xY$#PAdBkvAaSCpFeDxt}chuJA%*bmg02^n5%TPlq%}2qCC#tF=2xHc= zvqWAdEw^f)-2GHeflf^k=y%f(G+SrPAtzui_+57WdxG!WtZASU$AW9X-p6~3NAz*( z&8$-rJ_l+gbq|#rEo}~#i)qI_01YUH^>+{Jck^aU>$7kOfZdcB2QfE*_%9#fhto;M z-S*X}1p;?v3}nl=7wAP^`Uy8c$*zzr9o8xKs{L9u8@u?&Cb8QF(gn880xV1wj=M!&Y<-lW6H4%-#k$jj{=yrmQLyf=VV0BkgW)s2P7Ym#-dUwChND${01 zT;8vt!~)tq>6gK~*Q8}B!)j-NkTO;(|0GQy%Ww8yf-5+}X67}TxpM=`Dtx;pZ2-i- z)1N~9icn9_)E?B@d6raVm08tzW3Ex4WaBKq*t7}K@71V3&4+z{vmg4&*~w{x)8&i{ z{lQUjHG?{0pNg1i_1}2?vLgcnpS&$@%h!9BbK>i}R1^;4uA}#x4&7>R1A4{2rPs+WRe& zG_IK>xY9j|*Orm~pF$G2r?|ClfP^jt@>_2~aqE2bXQ&|ue6NFA<6Fa%5Y)isU@_R3 zJYFUY97=Z$+L?+*-I_Jb-A&S=%L)Xwqq(16S)Dkdv{ZDvTJNv4>JFjGo@ltrP!A>r zU-?^mceyM?mw+VlP2oKOpA-@)QwZ^Jfzq(F{CK?zSyVGg=&p^!cQeQ@ybJGv{Dt7&q>TKpztg=R z%?lbsZj+2EyMg9*$Q43nN$8ldbh#vS6WZ`IX?%_H&iLQs$F_ai{h)YK4+}%dRjRLsC1 zOgGgJf&C6P4eDDt*yi^9{4CyXO0*u<-?ulYTD*l`tHDgV-x{?$H!9BpR-JW@wNDCx z{VZj*>qr-)!UIe`O$#}ztrHBW&R9~_)zw|4SOckzZAu$}us%83rQ=%tGs(9HCWWab 
zNYIoxTCKabJA&F0W1PPLYM}o4{QNLC8{gH?aHBuZAKS{Wf=e+NgI1t7O!Mo6DTIO* zNN*N*zxt!IGzjDcarSkh~ zA&col2Mbs^9nS)6(!hoZ!6;_C*#KW0>24k<7kbXUwy->0$%}z|vg>TG+QbF8Fb-DQ zsIcj&j@_P3>!_l0iSQ$+bX}orSp^kQz^hM&50ye6d=D~IViTCOXn9X)f>rrcyVj~< znZrZSIDZeXLBrc5!$$>bes;4fB3Aa_4T+q zc3Sxez#`#MykL)O0Do|v^9vF?c4vVg^LbrWdcga_=xUPVOvi(-&@eYpuc=?=Vc?}oA}J#~|h|9M-!n|NqICzt2Mf(Laoo%Q9VU zXN$st3LO^?eRq8_806Bn02};k#mQ=3{Taem9o@5AxmTyPcoUMNnViR2q37eOp zj$=QJBhdp?k{nk<6>T?k5E1FkAyhnHp*a3<#AGxT?senZs*tb~rq37bpvh6l7V>S2;4%m)lyzZg|&>dU{CRgs&$A~1u?Nh-3lIRb^Z})3|D98e~$DQHvL)~ z7!E&w{5GE4Fl@Li(l(|Y(v?~50>;Jrb+QYR_aI}5k-NMJ3aTt0)Oquubu+2MHDHk; z<*Y+@~KcgEa_F{T&;htNq(xI*HP@82|X4gBo2SmSu$ zl$I~Y&ree5x;)!9PhocGxo|$#>XUuTL&6?&7&6Go7WTpMzSU7#_PUhxOwHYS)3&G$ z%QizQ()G7{NU1ECAr%?>LQ7n?-o*+;+s}EIU|Jdw?KZ;-{Q=tJgn&HY9@$i(eaP%0 zt+#2vlnEYs$;u>p?Yd8SDA)zEFx6CefpdwLy=rUOZH2H^lbVP^PotE{nM#5|AHy^Z72*jF4U};zJus(*CjZU%Agf9g>=cS9xZ>;H<&lFl-K{?^g7&=n3UhE zEWnZp_nNXMs7MrX&H)yzN5z$Q9Qn6dPdY@~*{opIK2>{=264U= z=y%}XNPsRImC{su4;1I~slPwHe&n^aR7tMe>9^Yxd`cQ{Y~#JzrSC*+q5;~-MC?#o)PTb z?M_$o{&l)f8=Q?a`Rv7C{nqe8pzQca)4WeBj?_*n$VS@RxLY4)it8ZDK}5&0&277G z0;%JBTZIb%2bUd|aW=pt>6R7uKf zz;EjBSexzxn*^&MHgru9MxD75Y(>#^)7DoSNOF4m;brNq|w{_cZ76+o`G}T56>V}0v_!*KwCV46vq@0sb6fsI4iB*~N)!f^cLav|L-Yw&NmR!*Skq0J zw}~m#vM+~LQS;pUd4IpjLGWq(45@bs%U-QV%6hH2kibK1p)ycLVA{gpvF#C3gjPgu zbqT51Z4+tb_0nZ1T|n@fWbZneCe|V;3=viet_x6qiR2;z=J(Z1#XHW#H-o)z*x4(e zu{tVTIOXtI%Mpob zs1>SGj%HhYGN^pSp;)V!6e77}>A{seL1`m2Bk`Lp45xcDS(kfSmdAH+vVf$&b%{|0 z+qFFr^0qZt5rHwwGA4m_FW|pp{9}isU3*;4422<$B37?3la<&vjbTI>%d0qM{sJR> zaum*ct-hSa6cn4dPy^aqyt5Ulb<-{Z)kJc>BGMJmwwlr7&+a*uNEs9qM*TcbrwBwQ=~F*mFjX=wVmGqT(Jk*=s}{R zd}(=y^JP!n$Tq&BZ0*3XYcK` z+>Sp>GpII5b)*%7gzy;cPBiOWwP8QUvHu+2BH>|0DWg`k5u-T%o<)y4e;N{L&oD-v zRTa`Lu+76uhP13N8k;XLk|zv$TxFVVnIwZTf%N>scHRG&ZEc{UI!LT%J@tthaP%b! 
zjLp+RNQ@P6<}aG*>6%^<9Xwx33s<(|7IPSQhEHx)ndXEYrP@W9haoEiym@-9xC3C% z&c`Vd&q6)xY?m?_#)0-YJ-?S=9=%`?&YnomFsXkzxDJ`NMnDDHcpp{uTd>inKK>&m z#B;FU-+Y^nQCk@IsX=Xc00qMPv@vSdB`S~dE5A=$ze#(XDRl_74bK=Ac0DocvIj2D zw-7rk=+5!ClS%TLuw91}Mbr3K)?Y_-qgDQ!f^f6;zw#4RMI&oM!?EQL#Iu^>-6|6u5RMG@e94TM{9wi!qUkB>=j1_?fOU?bH$(sAN5d-idrzG|497OEt*FD z{zmY>ia2SDE~gtdRUQSzrY_Rb0<({j#Pcu&U9BD0kxDQYj`ol3RrW(VMLIgR=T=?I$b-R>~jMm@~ z)^w8aQZ@XY%anSi4Ad&%tWlKS!1TwR;cS~14)Qx_$dXX=@6reO_4D;FeAc^Z0D1JQ z{yRa1^&l{5{!89wMiN@XnHOUGY4ho@fXes;%p(=)uy00%}fzI4tw)9`ccAcGz}SgUz_tdi20=;GQRTsaHOUoNt< ze%$Bh%ZSFW@$=Oj2p7h~?~V>XD2o;Kq~Xn$g~$oi5PGQY5bF=aIt)E}j%~2z$Tq<8 zd4&B#5&s*;Q$m-Q+=2SiM5lfNG7C8ue8gKg-D!jVfnz--{yLP_Rd~x)G?AR7ohpqu z<=lW!u;!sV4bk2Eo;fRedqJe(|AJzLza(wP&wbzKseHTK<_AFc9aQK>HGvqMIwG}-g2XlHb2m@V29^*c9{eGkUIyS4|C?FkqWtzh|+}2SgB;;6&v${-W~~RmaG|Yrn9GWiZMjs zmx~SYEksMAr|OdlUzt$9QS%+Ii+n?#EcuV0LO&&N<@>h8M(A4>Ba=WsHksOg>Uq#9Qt|3ow4~U9s1I9n#bxd(Qa2yG<#3QlwA{q9={^*wv4&Sm(R`eAyG2dAH%snVc9~FYzqdx{~=brNlu%Drv zu!s}>^YhdAU+$lu#{c|J{GEZVndP9xcOChCjCFIUJRw1; z^{e%e(6E7`-C%q4!tt9l;iT@-HEltaQo@;^C{CV-y*k})(`|n~=2%Q^9**shHluBW z3+YOoW=?%*+bjW%s~@LhhtJ9H0};1@7(#0Dk>90t?@O9EvrK0Okpi_@HoC0+DbBR3 zV9bqsd-Fk5#+zG~CyZ^INJo9FN5pv>q;u1z=t=VR+>_D|Uq4^p`*7)+qV<2BZZEd$ zx*=~c%tmDn?kWi`8imr73SGR(HIQT|LN1&3+`zk&=)*L>Io-!MH9TETqUTW=Z(Y?v zuUc~E9GUQFqY>tfs#8U*M4Mu)@we+R`URxhRMW3;uL`I0xt>aHN=!;Y{==o>FES<4 zc5>1FC?JLQ&$;fy)@xO%e;XP;wY?#6wRPVn%+w(aASS8;%#&fBK~IQV+~!TZroo6b ziaJDB%111To>B_6BdIE=6<1(Q2RL@se;#UxK|eIc4zLldIl&qm&TFtr>vyKK_HCkW z4IAHX5Uko=tmQW2SxUhWCmrDO(quyUjozI=q+0^IdH_c2LusnNawv1r-R1jrY2EGw z4a$)SC^xH0f&meTbIMqM8-MUMcyl_C(4bOPe8Mf?TvSIzCnV;2dx#Zr&zfNEWoR_7 zfx2^~<{xg85l%kvHWrEVr1z@Uo7TBx5|GnKmZh!4v{QcYA|LU&$d(-%%`}^(ge@f z71TR!%C|RCCRTxKM)j+ozi+KtN06wAW(0Z}C{gfZmC(4MWFEu_e@=1wfArJ1k09qZ z#{S$Lj_=d@{;mrMSxH9{1QL8BGBIodmZr(tt|=Imz`pj~6U}8tWIG&rkGw)roG(3G zL&ZsM*Tf6RSw^{tlr%GDNNK>oj_+J`SMxXtUQW=b#S#*gFpiZ?{J&PoQggc<1fAy5jOXzJx&tDdX&3+O3TcG84^YgS+o z%BGl;A{7xjdJ~sSi;4>Jq-ktd+8GQ!om#gh*xP8SuN$FDl4#XOt=ChOK3IXm58`}@ 
z;EuHuuAgB8wTi~Yc45@qfgh zkJKU3DrTfq=apgg23&<+Ay7(MbR^`EBaOwhfb`cVCAYg#FYWaAz83Q3D!0ot1{Ft($9REZ>?c>82)m?A6tAR9?v_p&DmrV*o_MiZBvFyL|&yW`&ct1 zTDIK~#mEk!h5<-jgo>lp&xF2YPeT$qGCI^KrgnO5?Mvs)5tbz0v932NL7T=c6lO!4 zykys(#!Miep2qRWz?-U16|HYBN3-tNWV`^{G;ffjIR%M7K#6(}U{d$V<~h%mQZgViBvai7kuS<93Smp)ibQzSxQTJJfy8%QP$glfYMC?oxC-pAqe;eUIy zDq24_Vy{O?wGR4eHAwfMWz;B_L)RTqQoaEaFF+j?t!j!$e~4&gOwBsvq4wo-wIiXo znpGtkfw4iDwg3}>LNg?Xf848LJkp$&%ab#+ChfBn48&CX9O*B`Enxt1oZ2*XJyInr zk8b1&dK@U{7ZCJDK^l4mn0_-iF3xBi-&>n$$Qo(l4PibH1Jr&xcVFxMmuf3@AwH+vHfGG;|6PHA5c?emu9To!+ep z5i7_&-Q*Z-myKj_IJ9y%-qR2@0?C}=gcmz>xGrmey{^^+uEJ>3U=54GQqB-Y*!2QNx|aKk270lojU)U8^; z`rBxbPfq*1SP%MvJyimZwND&D7xmN&KoJelQ~&15_2is>UzyAv~M z?zF1d{WGbrBdE~Rq9?5?sN0|=_Xv+nf93T48CH?T14Qs&#(JOnR=Ydpf%jGa&&}F18PeDaGKYlJ3PM95r!a{naW7;!w`&C%eXdQzp~~F1(uDL5(%%ee@KbgU zvhMg9!wM|qKAltScS;x=v7aNP(EfP~oO3lcwc{8hNDe~6yl<(h7ic3+wz^is`KZI~ zN(9d(s6X2#yZ;W7_gyue9BphH2z#Is?GI{%i>B(Eqs{tp-`9S)UMS4fbog$t$-WhF zDyWP%JQh@eH~DaHg&TyycSd$H*{FyOmT;MQqOo$Nr9nF)axa^979W?2==S7 z+NUxkQ)hx+-SV+xzkAo#>XxR2zw6!&Sm(Vf=Gd@|6}>r_VH}1?n&8g%!~jh|vcDry zVos=hXkq`)=%u5^!U2w4_{leK^Y=%h?|hD(*_S6Z7;1mR9D9X%8{$W#3&O&(vCu^% z%Cn`fl6~x|Dji`BjYM4)R^zdHRQ)tqogA(sna#SAQKA6DaPsf~axia;0!(4YHe(0^ zgGV^nu>T#rGz@GKAV{jki>3xE+O5$>fSlH@*5nNZkB}3kRIf)sMIWd$Yk*1$4Twe6 z%oVwFANvR)F+-{nq$-~NvcGCxUps&B&F#XPcV zO`l>+V5X^Yt={2_k+`VV_%&1x4y;-+kFiw)W>)PtsSM~yE09h*h4=sTOl`$#xKTB! 
z)yx-cq@^k&{SJd;MkH<@!0$&vcSXT)tFX#r0!ke-T#Pp zgZZIbAB?tK<9N-i%8r!@T4gm@&9*@FIjoH%J7uC8mE}xUiz79g0Ly=Yk z?Q1)Zx6ynW^NG0XY3zoXP&;pPsoAPIiymA51gV5m^?y`@^aC&4no3#n;8pg4gdkO} zW;oMl4ISC&i!i4!)MYT;+UR`5muKdEFIwt%A8JIV4QU;{IGg9ip zPm%sKy?yE5!@$L>KS?hylWSjI;tino4!=r!TvBSFL$!wlD?d_Q=prs#^{4U2S+05{ zQ#5E2tUfS;pHt_rNZsc4C#2^n@s!jI+2MD*HUMtgP#r7+1=M>) zI)=`Xy?4XQZ*Yh5Pg~6Mo{hu96_}gR_%KippeJ=&j$EK(KMSa-YHwvA*+xh_R%_#n zQN;OmWLs8wd0Y4ExpC)!W$q;v+ltl>^A z-UEl~w&%+=Wu#%P=qVGms%8EAc9U9DFL4p)S0JfV_m)SjZ)ZR&Xjrxc zg-1wLECMj3Uq?uNs?zyv6)68qdy8M_dfuam+wf8!0S(Rhvd*M;BZdL-s%|4NNAVy1 zuG=wb4KH<0NxyE>eD!T8XV%vRUaZ@a{9#>Aqvpz4b4)y6=frZtNQ4fO$Nl0;*3~ooPqq#z3>T8wp$&t=yBS zVrWHDXBG2kVG-6Np&)_*(cL4^uR471VGM# zV!F^c*SWVmT3I*n19sgJfLg7ZA`Qb&l}P6}fF9o`u>w8NS}!2o<*Vj4O=|7b^9%;~ z5>6r$nwGPF__@w_mWkIgn~LP>qQu43AZ^epglh1~_`_rTfhn!}2C9kXG z=reT3)?5G;&ny{*97CBne@wlALat`zXYHK&g)b+qThh=Yq{_vTKo!#B=OH4oTu+T* zP0AN^^eMkCK}8>i9UWeUSy^alCq5_!R1T7)8oT`Xt4#nvhpLD+kX>GS6sRj;CAumc zX}<{oI;yl5%}Q8!bW)twt*Hg%Y?p^ngND5dP=6Ar-#Ju&`MdG&$1ugwcOSdl`r$U& zS5nrwZJ^4OS-R&K;~Fq3s&TuxPvrIq+TNHf8H8F?rm_J>Q4NeIY- z>jV2%@Q7^)nwfDEq`g{ZNs35!Pt_^XTDihRVv9lTw?++V;5pLR9DwEDORjdI)~bOg zw32)Sa26*SG+<2)R*0N@H{CFBKEYbz^UK7-%(%(LrDY6X%W&6yFzhn)?HKpmBkVFC za?>VA!?12!iF8H>uf<9ihaF;7-E6yJC(N>&2O?D6D)vDGQt`K_2XFD^Ie=uyFF$K< zXMF(2|C1?C}EBj;|Rl3lT1OSABg@J!@#4IHZ6+_N7Mp#pY^%ddJ&i!)+#vnDY?&|H;G8R~cA7idK_%oWy? 
zdCi(S7WS$dtU}XB8ztBLfJGB-r&VNm zjpGCx2zZ?H@n_#HHq?!N9ARim=><;581%M=$^Sb(0>WzD#88LE7!9&dpM+SY7{W(v zh+ZT*5#$t&7URNYrAOV_OQolUBTS(q+4*?btZQsCXIO++Of@n1ab3xK=2#Gzr^6tq zSRJ;>j&d1e0ir;-adjUq&kg<1_ZX~4n?!2H|3-+k-dEol=F$TCqI$eDYW;sAL>m6S zbm_&{Iy}~3Y8Iqg%!Zj1-G96jhtY1_Otwe!YQ}28SCV6LWi4k{op!gMxoL} zeG2Y};HMB{24Lwg`u(Uzb@fn(^J4w~CEnW{O}{U(tdA9K3bBCjtFZyooTu1nhDYO~ z(8tRJ#>AQLUg!`7HHlNCLtVlP#5~X`BK%3Dwg)-I5)g!uWaw?QIaSlybI{y#tFh8j z4H6_sVl|>{GrxF_?)&#hj@N%AnRV}uu4=DS!7+je7pzBYGfJ6z#&d*dvs}i(cRv=N z3!AZH<@8WO1?w3~|0~Fa++cA$1&sK^pDaahjxq7`PPH|3cd$o&oi| z?yNv9NV48G6>!r;AxBL<^Zl|L3Vk3QbyZ0e|HCv8$EgEOsEMf>oIkspbSd@44yMP3 zEfp!q7~!`osG1jxI&M)%Nb8q>4{4yg@E7@oXsufudp{s`Fd}fE&nOjprWT}WP7U^3 zkY5}mubqdQBxwfS!p}WO@rPkQ$v!4w3AZ@9>jU)8tEs&h?yy!cZqWyz8!o+*C% z&6=wU#^Ypcsr7jnD1(Tbb1`ZH*Sq~rKLU5rIE&Yl_t-f*ZxwZ%AX+}WoLC9zR0NpZ4lB=*%RIWTNwj%69AO%iNr*3gyA7n`m6KS^aX}6Z&ZI z#7CtxBlIpxJVW~Y{0d6m^WO*2s26;7-IPU1s)|^H#Lav#EFDJd1zU5)t*PiNm0_L) zoHSxZ`}9(%6w_R^dmE^4wVGk%6>?`9n*A+8JbTeI!<&aG^GvrQoQqvvV2gEl)iY`z zC)Dc$vGrw~{BrN3pTCMT#B>;bA8>gJzT%p%xFL5`2Kk)@B5N==d!r1_)sxMAYDJ->jjMK0TPs(@g5&~ zEr*twGM7WpnR{JeV4{U!1T&`x-7ns~58k~VXwro`t_`#PkFSTyGs~g|d5-izrC2PA zi+&zV#|fhSSwe~Svrz=+L(P4&h_#JM*2XA#-JUZ&yrV%>@8H???0Kvt;W=FuPIijS zI^5I7vm~A#+MZA`26c+)TjC>GQ3~#p+7Qu|Fs1?WNE7Kk9qzl)_T#e;k8ST%UL`zx zHt7^aSYO|`c@9w3wf3GQw@I&NIrSW+|6dB-{FNxrKUdd(*{43&l$erZmuGLYaH%Ej ziqg`b7mLFL{R3BgPa_AQ)x_o5u>{_KH!Ds@IE%ZprXzC5%$fGTh?vNUZ32(cIKS)& zNV+vl){-Ug@LR3|XI<-N?x6PFs7NcGJt`uQ0>-!@$CZ9 zQpHv=Z=b+K4(i-BnA^S=NfTYUN_7oPl(^~X4{~5i%Y9?^E#>f!8j}OQ-{l3AYWABw z&BnWq=C&4$eVS!tt-1^S=ZL)^NoJ%(B&9*6nO!sCA7i1;)p83s++A(##S+b@EOqRXb{0RB~ z*&@h;+j7CI$6-`|vMjS|6boCD9$fMM@Bd(p<2dIjBytcNmvLeU-;I&!D%3tA*Lb)t zoim#>vySNm+Y^q-YACsIn8q#<5^^@Zh54_qfBoxU9_GJRm>WC(WBSKG{z2d8PlP#( zj4jDaF`V4XyN>S7BFaKbB=y)|?+BR{QB^xQ;qOi*4qzG-kn3QkGz)T^4AeSe zC2^RfczmFSn07$GYgw}HlDnv-C)YB}!tXP?Wu%V z2KmTfK$VM!1o{BYGcW}5)GfhjL@3Kv;4HjcgMayv8DMHxQXTk#VNrVk6tNNrwkm*L zZduZG1|?!hW6`ubXvgkMr@5bZbZ3@vp1!-ToAr_Anm`OGsbf;>H67+$tHOqa<3XJ1 
ztkMQ%RqrVg7C6Y9wZvDoz|4yas38_n9-EW-s|We&dT^KeJhL5WWNumcj?=(FLJ%A< zryz;PBpiB$IrT2i`A+wVrqn)Y_SgTT=WLIbFL(++y+v`NETzUAXrEn4LC>aubHD#< z`H&RL<Zz=G9_=Dflsh>A#1R9^W zjHX+u0x0#2y8Yv{dL&*cV0dH<@w+x+8R{-i*= zG`Qd41)Zf1{q59gv;?(|nKtoLhF;+-EW28Demz{C572ou4coxv($Ig0_c#TvP1Pr7 zi~B1R8wM)udb(r7CV2R4DTs&poTSqnDDSOwn(Flx`!xXVrt5^=2TGQ_a6qp~yIj*!i)27#|RxdKV98w1cvt|}WRqwAE5T3iS~Oy-2%8CLQRhy5$5p>Zv@3_;vcMO^)9X^+WcoefFTx zC-plFH7W&_mEJINM53+_qRiGx5L{QAUQ-agkB{(VJoMN9vXtvnoPR=-Y z3Fq1p2u*9D2Guc;iYBaeaP)>6!wxY1>t74THP$m9oY_OYpnyv>wf&}Q-zKz#JH=Z% z^}=WN)=}XeEO~6%Rbw;{N?UP+lv(pT?9+@yd?yQrn$vifF&-pQY(uMoJjRD(J2na^$;-2}eH1`xwsl zp34M|at?frQWW|L!mMNga&bSn%^lzpPtfm3EQS4GX%GO-O?>b2?n{YQYkFBOne}|% zekS`iYim2Yt75K)IsATb8VXCRBhc+ykdoAVhdR#*zQ6&<+nd%>{y6W}@SE$>%k+$6 zh(lL`_nE(kcDm!?X?S|5LOKU2G1tr3{*?dY2OgK{DaT)>n#RhRLFGIn3m+ijvKKI( z1pCx+OZ^Y|YiO-lnWFNuzzx8y$twC19R>0~!a0x+pthXLR0nZ*uJhNG5BZV{fcIkI z5JC5|h;grvBs@Eb)uxvRlV&0TftE|90}IhUQ0ZdR*fG%GBcQ`TF%=Q7P(yR}e5d=} z)4omely_i~COm~=>613xeWJU~Hbr~Jbr0uzF{JF%dv=*P7|Btf7 zIlCMqE4Vok;)VD6Q;0b3E*(n?>5~ie9M99x<;|>RkaKRYu;w7k?)VnvT|`iHAbsx= zOgK<|r!h|Qo$Gh-&HEf*Pkx7GK&!-^aJBn&w?i+Dw<>d>T#Oif!?A>0!1=|(kj2%H z!K|@Xw&`&zzl>3%1`7Dd?)-WVNI_}s839ymoLjUaoHM$-BKWv^-A=W&$tkK&QiXxyC! 
zH!;SmPJclyxpTruG%3*SW4p75LXLkeirs6r}6VV zCas&WHxG=6oB!u`w>d}63qK5c>igmSBDdcjNPlszDBI>`f%p?jqSSBwLqta;-tir7 z{9kA}HShi1JhyIoe!jik{Qsou<@EOcal74~pPt6M|AW~ZIk7iLZkK;Ib2DldkuI7D z9LB+2@5?aI=dzm#truW2bwdvlNyBFLW~3q2#nZfzHAwx1{ZI4H>7V02@BjSg$N0~G ze&Vm;pZ<@?BaYA0r@b5o?a&-V{#rrrL#s`RzPM}3I3dE|t@cM!aog%F)7A9HSBVbx zC~fAo7hIAvs7FH8q68BPe8K;{;`_b~%OjLF-pN4C%;)4C-d@+G2T-%T8$hL7s4Ad4 zz?gdYxFsH}qSu1@mhkJYAa%buMZpCw=_ZWT!*h&34Hb$%?0i+fe16_W{dxPm4WHWo z8qlC z+?q+M>y~lNQ(GQ41;wrDd;pXeVroTVQ<|;C%g;#l>^)H_tU&tJ{QqnFlkLcHEo%@S z+hfF9=?Zt58lneCD*n<^eWg`{x+Bf={omrpJ^;xBb^t9hEh6NSZsxr^4!YCsO_>*C zhpn4FUjl3DvPPmUQ2xb#F?5Crw(ySX!>O!+UO*i9I?P>rPAnIHzqpt6DQaaA(4a69 zO2i$YyRN`Fw^=E))e7H$GZhZz{8sLpr;uz?D(GXFCP22|U!t%(WH4JE(DcwJ%Z9uK zs(5dAk^)QBcQqH`EqF82M_f+9PSpQsva!7}ftfY%`@SP5v%#p<7WMHlm5L45)@)5K zT}px~sq|V~{2QH{ozujAb%|?hnSq9GT|nzM^Fxu?@Y8*-K?$bP$YEuRZbv;PQ837A z$ieva4OJ&ONidD*L#KNc1V>3dz!@dAR)G?%?HuhaHO`V8?H=(`TXQ8Q69OH3`1gE3 zAMGma0ZknY6{M%BVw38bubT}zz4McZxjek0NC;MdJH+s551HgZ^A+VBRfpl9(?*((KeT$o{gx5CUHF$H(&@jqu}+35&N+5whYtekc+6fKTE&&R^e?~+Pg z<=F4G+|VA)UpIC3h6{C%K8Brnzc{9LN0Qe0ohxmo!@vK9>y8POZRf#f}n}Db;8P(p1o@fZW`me*Q|d-&DqafV5%FRJ-iY0w-weV>8EvxEbV zF%Nhl`Kq`1@ROPK7*74Wu=9Wt!>Y+LRb3yPD4{Zgs?@|#wdZ<7e#k7V4wRPtkhof| z9;jWCRlf_=V&!EU(B#mW{%+NJ$>n1we~+hl!O2SQq)ARZ#Wk2T)V-ymKii@8(#;M( z;4Vx<4zp;2IWboD$0epqFn1{-7nCggFj;W04)TYx;$U~)d61^-|0*S9VmtzCnCP92 zyHpmpJ&%(@nw4$=dfcrdrhLg`StrbP^&udcqnR^eS^hyd`b8aO;Vzf)QNE+py2?;@ zm-krMmUU-tJ!#Mfd#l6zrX=p`>n4Od-knY1+=`%j7iGGV8c~M$*Xw##ra1Z!IrN28 z!h+IscOHt!)MZft30sQ6Pl|-}C%z1RZem)~DYxhmZWtkVVOpIev0Gq9Ab++u`@dT9 zeO}!a(v)s8`6Z&x({rTso7?L!sHbA->4*39{rdrCcV4_XAX!}yhlOO-s&v`cO&fjD zVX!C4&D$Jq%RXH*moC02*KWoZNW248tKK?jkDh-Xaqc(7-zCzi^liFy@>`JFKgzk> zONo!z@?%^=y2~8LU^3F$xB*DKys;{lHzC2w~&v9(^WE^K*xpD>mycF`l)Ubhn~7*F@|i+=hFs?|u(w+M=NeCv3SJ z$GdFK@R+0_DvSo?ErJ};*(FgtBMP>E49(S0{w$rJl|7O@Kn>5*>qEHTD` zqA`xx*)AKD9r}fJ;K5W5W~4w}45>p7lsAr%ic?Z`Ifa?&Va;6cjDM>1T^L?mCXBrr zDsA->=5^*$tx#=}^^)SaJ5{_8Ne z>%ID{z0qoNd55lgZB0Gp8S~^(ZOJEojQ+5DKauXkI+TNHxqKJ=*p4ooNLxP 
zL@flJxK%M8RK3_vMT=V-m-@Df82xzkB}Wh zb3$A$$hWOL8X3c6P&Z6;Y{|ZO<*zz*JJKVI6T)#)K1{aZq{!mRfo~9>Auk#En|Re@ zoQFHfnrBA9sE(|173Rs9HkG^Yx*t~y-@t6S!@J51%7Dga>NPR1wg3$?wu9L7YO~-D z-7dq)vLoK5gxCh=_ZnvAWO`)bPb%_>MkqSUXdKe$9FhEi=jkSCsXM1`(B*FO=_v8~ zaX;uGzSQe+fb*r_r3Ri4ykQgio+9JueAX~x!Xo(m2=SQHm)~3;eBUULEGy5M87L_t zMGMko&nvl`)~-t^M0dSJ7rSAN_J(;(T1{O8r#QbR$~cw}L$ync@1uT}gX{01`@Pzq zvfz)14$onxJ|eWqYbsGLJp+m+W1FpcvFz#Tl>ZLsFlGXz z%kKOxVfk{B#QJ7Rd4}3SBIVuDhPd1z@Omfq1J#h%dcjSzu&`)|XDCoAl)p))a6+@t zX$|K0JYlgdcmZN`Q^tNhqu)TXO%!JqC*BGZ4-#|k40rC3AI(?&Da!8yFEm}EGCH7d z+FzEf-(&t~H?14d8~3`_vRYcgC%}We_gcP@ZtxEz5uT zjh~**fb1d~n##+`FO+d~<}pxJ#YsSy*T`eA)Q`CaKZd)v{!;iMSWPxwpV|g=3o(?% z9G#+8_BzN@E|%;sU*=lgrG;K@CPx1sY`&T5K)-H%*}AstI6ne4KAWHlq_J|tXF*Dn z%1qDjkV&UoD$_JvCW4=bzf>Yv7sc=wXA9CgP1Ds4`o)I7!m8X6h*G1=HPIVVKC6oAcfBOdFrlbm1&qF5f7kqQ8Nzc6;7Y$ z`5SO%aN{zT7Vr2(wubv^SW^Mox;=Ac8?lk~2>QXNJj1k{s+y=3t04dSYv@z!{5o5+ zIzub=5P1coXBZQA2m_ct-#@?95jK$s;#Y zdB-2RTODLYTt?GbxyV!f`oxrM|phQoWwodqp4ORt?dW>bQI2i!!Pq(z=E#YQIw| z{Tc6nao4Z@vb#q((W#BuhmvohfJubH>N?Erv5$e0x%8a!nhKir!7Z!#WNVgrw;X80 zjeqb&;VJdKCsB(nX=`I2bSbsD941cE9Nd(?Y(Y8<<9WB#<#W#{wxUmh!BI}NlaQvz zG#S;lAjO~LsvM_U^YPlsTQ!gcP~TKM{2@A4NWO()9~314OGNSwh|dKXM(;NbbEZWS zcq9AHb^!zGZ7vEl0IjvlZFfPd@OvSx?e`2CDB12Ban9Yb^Ou!nFZ=yljb&y}vx=i6 z;*;t|y`>!c|AswR+OUg7E%&A=A%S9L^+-+Bid0^+DXXc6W&ULgwVnEtt8FryI*!z( znenEx|M%5R+VVcv#fhpiQy9 zMp`l8X6iAY8CQS88J&}ka6aFVFFy0{;_`pnVpt(_SpCfU?S3HO^XrjjbD$Mdcl2!9-XltWigqqH5YJ=Fv&}b~1Jc_f_Uwn_IaW@J2FBzZBqQ@uL)lH9DT*IO>9UFWd^0y}2R1TxL zYRWwgZNG;TdfKl^AgFzJy3wj)WMldMfvet+Y^g)Bvq00g(dv#G)s|eH{P9#WtgRllp4F zu_vO3T?oO z&zuaM{w(@WSFN1qFX!ANDTvRT=;;=miVHYUo*5TY6=f1eKsUpgZp+mQ_{0VzF^}0Z zwguG{bvvDC$7>Leh6_$Xp5LCkLdwjTdLd7Qta73ytit@d^?g6g41ay`$3D(B@`{db z`S`BKCy5700J^&6yC$5=#k9NRTir5QckBK;jW&ZEAbOxoi(Ym7aYvGr$mS~;(GID_XS>ray6Gu665&=y2^vb_ z$U*9#^70QXgR@KD-%3^my2M`TK;q8Js|l=eQIK2CxsV2!n2Ngf?|nJ)bP#}+@26<& z2op$!pZx`AL7;(w-5Rt4lpUyaI|cSAiH=8{I?<&zY|=w>E>O+& zx%Nq|N6_>=hQ=i;)~G@c!=;L11K3-cKxUR)$Sp1nLo5R1)MpM-%5DxhRXkMAkx8c% 
zXv>iwCvpwDAX^>#t3f9fJ@G0*i5Ox%%%xZHbQghf9V2DJsClwK9K2s&NzEFn@aeQc zr<+~o3jlKFdISCAE0Jb4AywFjCXfqqC$p}e`>W0}XZgbvwqQM3D&9de%b$jn6vlL_ zFrn14Gbo;Z6~7K2_BzYoB&_J@M)Eo^bK13^tlX^%^6Tqn)?IB!3xB}{U5ijJD0xZ7 zff&MzTOH)2cob}TbhxU-H4*CUC=>r^47Z6KU|DguD1Mqw5sCX2*!{D*;@ z(e#~5U3h=}HI9G%^;dlHJtc6afKzoj$&e3`DNuHs^pZ~#c@Pck0=tf)7ZL+bH68TN znk8@d82tA9xJ*o>rx($rA+y5@klOV@@ulKHkL&K#!E)Qo1JbzM-j04%TTwnZ#_B@aTR;uvL>y;=Ie z6({NziKAcLDge`K?y5!iRc}B&+L(tVL#2N9kIn3sw2T4>$qA%|?K?E|Oq zRGFtc#TvGoh%sv@e?D)P`dH=>Q9U1ecbM_47vVIVV$9&hSi{CUi95v+s`c~xW2bqn z8d6@{B5L6<#D`K2)Dt*_XJghvuq~->>a557I+QM)Sm9h+tijG7o#Zy0J0H$ev-aSc zxLYBJ4$0DZwRg;Mm#hk7K#NI|)74e4g~D_(Jv3)eI@^6ZXDmb z@ws>IWjz?$+Ea%y57gMHom#N#-A{QBT<={T3$8;=Lf(USnUliI<7LHe-IX|LYE%v+ zx#bc`BG~fghaD{4{9$hZ8y};8m`T&;quircqoJ9YAEm;qb5LLu7cov;x5f>xllA5| zXkcOt6UK&IXX>rNj2A|OV}!{tMxQkgh%W6vl{LS)kB{j4$`3fd6#x9=qa*wwKmLi| zzQn)&@mK9Qm99yA#k-|>8UW{cLP2VE!Zo4nv$=GC=!9I)!V`VCjv&#t!fV1x3BsB3 zGVzMftcca~GhIqM!yO>&rm>GpPx{Q|Cc&jZ>|>$C$==E^(P1{@JoumsY|T*7&^?*; z>L8@5wS;Efa>HkfvKzjNa(X6N^t)O7)$yq(JVbgX(GLK+R)j(vqP+?cA9vc{c@-kX z7ZNEM4RabsOo2wF;O7WKO>~_J=4Wla2Q@rRsNOendiy!JQg?xqA*OywaB+Le#93Zj z2hydD$`a3@zcug7f%6bm`1f1utfCJf$if-o4?BSB*(dIj;9~>MrItFCSUsz+V$@?b zbyRL5u zsU{gxaHN{KWdIhMSed3XuNcb1D)Z3*<-Z}PGOYQ5+f-X6sVPKf(l{AkLdRZ09^GO{ zL&T-71N}yH8?+6I@$F7%msjV3-e+pvH1RQnJdI!a_mna`Xf&C^x{h-(;`%eij8_kG zx^XOx8KCZf6s8`>m%&>-K3(lAPMLpA-4fOah`h56 z%0&uf;s>sxY^d{vi_;GC_%LaTqatZJO27DNI$qpZ^E$|rW{HoF>F&9n&_L8p@ps(! 
zJq81pLljmCHzKRA!u-7bK~3Tr2_X;$IxM`#HekFc`%zhb4*gM<)sS{d@UpjT$_+s0 z+Z+BQ&9*n-JYRE)(=OUGMd7%V5Q`CU5AW)TK&_nf>Rs-VIRpAbK=XLRh-g=h?kyGf z-Dur-f9rnf=W*QtTi8>d^uL1bE=FSzCRp4HE~X$X@s|5r-JoB$J0(~=C9ks;kxx<~ zGMKjSeRQ2{V7b;p0Kg;(kUGrstZ*XVxttVE-OnCfhC&+R0%9xAANUS^hSbPafw|9U zN?q^{e6kg!wmz1M03G>iFlDe!3kno2Ci!QcDD2Q^yg^6c&00gsBR zDynOmy`d?0r0lH$?(GES^tn1gR8pEyMjN~)MaEbMeg!0I+ZMQe8S!F*nVKcTl#lgm zt2W31U|>*L%4-|LOG2g)Pf`hGeG8-wpyw^%iN1ca^lLm+Yl=Qw?K=eXvZMwNuRspO zjmf#z@ghK)s$rwv=HBvbOwaeMciM+*O$nSQn0P_ZE5+x>3Z&rJC(on>YH&VS5kt)& z2B@Uv8s?A~A|mv)?`)g6E|Ev-RlH3Nq)G}W%1G{5`a>J3Pj&SVAM(70g20!?WHDQL-iqXXl9#UIOj$4T_1DsA53c+|VH2@xAMY}r?lhdi#-jmE zpgE-iGTQSwUO(|9CHAjzqFFDx`W|{rsc21&hZ;PD{DNXojYtu53(}L79}}hhV+2+} zYC3A{Go?jEH(4~uB#67h`SV3VU5Qh?{(6=2>FLjt5HC;fPfx=^+#6Eww%nlnou{giYQ)*AerfXA{ao%#OEC!v zwhL&@qP=u&(HxX$ofxC*NK=r%B2z0rla`HMi`HBtFm$Oq9>gnfL=N@>sEm|2K>l;I z7m6|`^?7^3z2C@xqEkjsKUx$Ko1jcm+*iCrVkQF1s)bK|lc$nvER0i@tvYX-_T4n`+GJ3yx$9mg`VANVOpD*8pC9x~p}c^UE@}_V zqJF*b=2nMU^P&2k-PtfzNL0$!$2G?^46`OnlmKAjsjPyG)_i#xMz6g0YU%)7viN&v zsNub*{@J8MX=v1>a7lSeCtmm@W6=UMHT35vXnynil;P9;h7_m{l&F-^&#(I6Iub%( zaB`||WATcHTOH;B$24DDtOETL&kQ)L@+o2q zK3}B!K#%KpDHu`FlDM2iJV&g$?27bLnA!>|Fn`J#c%RbEwB+$8J#^icqptcIO;Uf@ zoC5EXQwKP0m-@j9oZtI6$<3P*Wlz1FoW9}Zj%1Hs=bp5vl}D1AFX9l)7ZK?vTX+rC z?!55Arb&sKH9u3IMhrVo>GZBQ6{Mz5Jeq;X-q3WCai(X3 zDPp1}t(dl!)(Lw+8-%>mnSu_jQ`Ggen3g*wB0D8@n4h=d&p5u%27KQeC4MC7th%Xd zRrK(ScY&rq!N6|1bI@2+N;#i55M>IQf937uRgtKu6$&5p}?FJM9@rE6OE+k!CTe9Chv@#?<=<#m|U{$hGK z_|VENm`%V{IeR3DiI!SytJ`z955I>$Z*L#!O?LN+1YoSu|j8oNQ0Ke}not3OZaAtfDJZz=c48b=G9Hf2Zu-GT4N zN1xFoJR`yC9x$1p`O>TP4XTKcp@Q@KW~Y2(xR3UAOd(!BBi#aRs5I!@;%fSU88mf( zGy5g33Fm9{M68WiUtR7Y_+8rdwTaeQ^5!QZpa;(Z$@uh0iLf22S7C@iuTBN$Chedp z2;AK5+lta`R~^Z6O5 zUBV|zhbwxi&XzFMykAu!i)w+1PWFMjqcxaf_grz9GxIBIB=H6a9cwKE(AhO{$cT3> zICp*0jex=hq7%b1k7yt7dap6{Rd^?UF{-hNf2jrPZUdz0^Pl?TuI?+OXP4@Xlp0~E z@ijjGjDsFHy_^?cCcftupk9U4{SsqjKMd;GQZ2;KjWC?g#Ta_g~#p;WJb1iy}Rv4al_Fat5Ll{TG6 z9bD=!xoFd&rwgga!2-q1h{$BqP~2uyqAB)to&+WZ77rz`fK(D^ZD((Fpr9$X;l&+u 
z6t~e7XQXZ!ZKA}PjgAizI}B&^PiLHHB}e;X73JshHibE2@*fyKXP?#Ew3~_0#g#vi z*?IImV_J}N%Yc*K6t0Q1w>r|#TaJ~7S$jA0*AC0^YJr+-~8O%9Zth_pX3??8VYfkf#xjQ9nW5kJopb)t7C1%G|3)6aLz@Z9M{ss>liA7Z_MB>CNI1<54)2NEj(&YV1iEX` zd6bnBkSL8J?Bm7o!1JUa+Km5U9p=woKgm$+&u|5&Scc|l(nUzL>^RShIh_0(ly_7Pn1 zf!Gy{P;7(pi(cmNvko)j9A1t&Yag^Hy-6bZb{%|tM$A?qHQv&N-vo4HztxZZpUaPD z?$-b>8|mW|j}o`2$zrlCKznFqxN#Z8p-5J3vnhV!U%4blbl7ajTNb z=?qm0zD>izfOrD)i)L;!9zQHidMlNZeL|4q1C^ScK_%&WhjtitpGsD|mk!B1?{0ON zUrV8>pSxg&kB^gNUR^&v`OeYJk5{RM^DNj9R)I9#r(F}!)8q8mCPgZb#@bB88gEnX znMloxw%DUu-Kr?(UeX~2%zD_%>0NHyASIdz5D7Ak4mh63)tuH?CYDZudJYt-N#LP! zY1z>Pv#mH@Yj>Wh3uxX}#4XBQ6Io=)BndZGF;*JNc280p^(9pr(5vbDP#$?6hLVn( zmQ6sT5j{UjI~KjnuaU=Z(Eq05F4g=bDLyt3d`di>p@bulQ;zedbSwAidDv<*yORAW z91O1$vvKcyjO{%OLlwV5UFKA(858-&9D(ZQP^7`0sazX<{J8KshWC6Zyg-T`F0|*i z04*!D!z#j~E~D!UPaROgS?VbBPBViNS7-SD#Q!Z*gy*I#*#z|n>D05s6t0hrIw@-G zAE$25sxcZN4k+HL2e`~F35j9jC!;&I>Q5qqd9oBNl_s5l`~Bu z<5U}*iTmqx@uNw>l~)mjxOzcN4yjNgz7KJqa?YXxDLtW$K+g~Yzj#0cWY(>a%3*r- z$e@W+wjHPF$KfQS_W%c!L1ZrwHKSk07D)MHPx+EW%v=FVC%=Ch)t{ME_$^^RBqlUX zNa-%^qGQs@xCW9hJwqBdb86LV0J_^xgpd13?9Rn9s z?TeMSOHR)Uk()GYK)=5amp-}qyYvN(_Y;cX(ai^~4huy%WvbLJrFL3?{&pWu0+yT& zy4L%;JPS@C6LIK4q?u+$m`PpfKxOvKD$ukKHyUXJbM`N@>P#~u=pY4GvJ%DOdw%2F zoPMkUT_XGK3z~lAADC{T8$U*%ApDD))Gfn8B~08-5Y}mcxp|53$%!2>t8^Q#f;y`& zoq9*&a}tANM)3>3NaW|zNF6B1RcQME50*MWbmKWm0mZdX?y6WP^6Lgo{SOXZIw&*> zTP}UzHg(kT*o1WYNHKq_#@APBeTe_qb%&N|5XPjGPG@_Bc@UAG4?QP%l}@LMLmg$@ zq4(_#zam(;y!0f9iXK9$Hx;R;0;eS=E!Tw8E_udKLQTp2QC(*G#3f2|O}^_bB(PQB zNV(e_U#1u~Wv2TxoYeU;^^8xQPL=ldN_lyf6F1QrF@jnhmvt|EJZdE}w>rp(@-^Gh zrfUe7=PDFkp=`1zX>U+vBwnSQs-{4SYFTb1sslaxq@{NoGJtw9bne7PKZ&8F&XGm) z*hmY|5I=re_6+c!67p4`$rlwp<^)66hmJk-oQYQYtoK$$xm3Z2k#EkkCj5HFG3Uh{ zRc(EUdu182FMy_nH&<{Ttv-|_F0(03Y{}4ShBw0F&%&SqDcsOzP>>>159ajgHoo4~ z8sp5Hh;tM=c=-HH(oFNsV&+o$6QrLLQc69o`U+?QtMB`#xA+al64#qlRhbJ|RSW@M zbV-mn5pD}O?*Ww`(FyQ(!yfpxw+H%dPfBBTdL(VB#~eyS!)2#wwn z74Lw+4C+|tFh%ytHR3ct<>Y8Y1ooWsq-KUJ*X4#bk|MN|L3tLuC02LrnsV$+kwH}p 
zP`*b4zeYp;UPW&}T{Y=caR_)x(c)gTjsxp5eQW%jFKJr+p-;ix0F{Y#t?A=ROIbm% z0`IxQgDSKwp{_iAb#bE}XZEf>Q^_$E5UevQX=5zQUxZFNkFj4_{pl!0Z)#~Xa$A@U zuT-uwBX;sg@2KMBbEMISK=uA1r2I2rHskU(oqO>iKHPPp*GwcqbFOR)elihmRhV-$ zI=@p%YKDG9_p2H)=6luS%hVMZ4aMpgF^O3hNRZF1O$7346xli8ct6pu{%k*SP zQxjZo4h;sp#$4bGBGnvPa8hz~aqjE=1#SAiWKn3M1WT2%W#M@pN2WSX6VChz^Ops0 z)=7UL0p&<_&4N5HzVlu)r}AbA{$ zaeHL(>&E=K#k*rL@$|(1-Xi^kgbuZpZ3xoqJ&&G7glMY82kzjRq9kIo-s>U8?}U-l zR#4O&0UI%llrbTX@5QWYOe-d>JqM5cCMcqGP!;MeqthvIDwpo0sx51mC-Kr<=FZD* z6KIM8bmoh*&ZAH_MWO!|=C3lomV$uP95QbT_sE7m_w5{X(ydhp1S^#I8qRc;LUu_2Mp5@ejhor+-?x2&R!&*szk7T(68pnf7E!v4UZ^)CEbt0{ss zI)=;g2xu@f$zKD|VYp9B{nO4YWBb#_JhKHlbrNBMO+pSQ@bo=L3YRJMy4Hc7ceWD` z*kjL;0ppYIeEfglnJ44cXMUbqK7rYAls3;%J*Fd`3NXHzUZKOl@C*be#m{?G-|9eP z^!Z1PKJL&z6Ea8yoE`eQYIb5Q!7lZPKY64SVa}7l7A&dJo-HWH^osZCtFm-}6a-q{ z?5KR->qp2411TJ0uDT>mEUifivKY~|;2dJe_NRE)#hP~fQs#aMJ<*6w6uBViUrxMd z(*j;J1O2`Y@x8>X$kP50*s|}kywy`ri~vR=lhZlB#+vD0lEj2UB?s{4)hlcVnojWXn_)%9iXCJwZ?af%MH8=0(}U!52y6w7vsw^Eku8 zIZTk#@$&jbue;pqR8>AHRAaIQp%V{4Imf(()I=9<4g2(gg>O-9!7K)(nr?)a+P$;~@A=GT%Jdu;~V*5zsv-H(fI zUeJ8lk^{9yXOX5>oV*t^j7cqe=+UJ4wjuFuDuPQ#CGkbJ{u6s$E+J;F)EI#ro#En`s zxEA#7A!K5jD$cLldwj-+-#%$?4-*H(4jI+;-q2k%R70u;MOTl;gCZAdUaE zEiezPNRK_R_x3U!esb_p|A@Df3?&_vc7+?FDd|+lN%_$Awp{>)6Xz&ighiWgEX)XeQ@W|F3#>y$NeQUj|QEi{63H|Uk;TXwLZ-} zh`ou1iI9l&gTRjNB#T2h)4US7`=JK(6O-W9xnk+JhyS#qy!+0lf(>q__@5&+$~S3| zeGoKuQ-%4(2E$Tk>pau)agYCKE%c@*ugr|fJ<$dl36?Z>BdnknGIl@li!`_?=Xkh1 z6Hxe0UX9am^}*+nVQ@wIrdWS1d!8KSKvkzRf6^Y%PiD~wbnvLn#dqDNh)Dt;+5bSt$!<;su?>4U9s`q90=+AOOib^-u zv-^}IwHOA^Z8}w;`}Zh6VAo=iOjnU^C(KMM0CwTSwL(xC8I!Ctr)z zftm^4q)WkF;8&?<%C_lw65Xx3@GB?Eb9_5z|56|tXzof-u~zzG`FU>^l=wO--JLG} zp}RM6rpn%r-P$>6I`>Iw075-b3|2=#At!~Fr$>PDC*O~EziuQ{jT6C;8lc8Z*@@zY z;?7Cp$X2^ZH;Z4a^zLhVdF_);Q_EvK{h)bRj#H6@@RC7J7%>S>mZ@w~xf|SP z3QD_l52N2HdZtN9Y`I8zFly5HMa`Hz!aMx92hLKWNk!YZiksB+pdMsv?>8njO;`s% z3{SM`gLV9>KPunJaE6sJCPKR*hECC&4uYp5JfCa;ah@P8#`C2=;wm9y&UBxbXD}wd zacQE5f+ju5CYUo&JC{VEI8Zm~C&@-?B5`$Z=oELkP3LT*w%n#^I21Et8secCqNpA&tzFGSc) 
z;OrdV9fTyMoc$`pW7JX%^+X3iykFck0Y|bek@{P0hyMJ0-#!NQasL?mh5Umz8SV5H z?bFnRd6UkAcSN8EhmY0n6hu%-|2pZet z4#igGosRKDxk<7^DUp=#XwlDXi5+Ug(-{2{P|aE>!rzPCIaIf3*;N#lYQ3Rh%Yc}< zXxj)(3qh%nELvT5+@q^Z-i}EdEb6jFCF_L{G=&ksjJ|j!$nDRq5K6oBfL0ft%H+|m z;!hv96g7$me+c-fmbbe5cltLhl}kFga!(4>aHW@->juyWik>mU$SVPpJFm#kPtsJnO8q=+QBO9cne z!?ET##WIyc4pm~wuWj7Zc*2B4;0u?mszVMRJmg;9c-~ld4~is`sY0F`RPAid`SA>^xHKZP*Zsi)u{NY zdS{}$VvcZzj<~=YuGasRPL5PK@#xwJc1C+^W!gcbdOM?>2vwPH_Yr3WD2Bk*W80SNe9ffksSKh3F+FR4}RG_kD z$2OHI9@3JfiDBHK#I0ovTl)C@97Ai78J1FIHzNH$MjB1{^zt_R8J>b2y^gCNA0woH zLY7EcU-+)LpoHC-!_5j)aQ-2e&?&a`yj458DQM!U8}zg*YG1OksRhzv&_zJMB?f#U z6id%DeLm^6Rbg&{riBml4_onpHjW)6PBA(Zz*A`gGiZ@JbBVX83X|asdw};0+SnfB zL7pq5h6tWLzzL58tVLIFjvDbk~=DK*Sor_@80oF=bT+@V-~)*@j9YdXQ!mD1np zI1f_1jA3U>uGB)DO45VD|&gkFOAg+dT%C?jA)6=hWCcT~h+<1XH5|Tys zB;NGFv6_x4NV9#QWv}4;`aUChOgkQIm=mHt zI|dzCX7}ag>P?@c=jew@sK^FPkuXP8FunyvvQpAg0{R?g^qAdPQ*t<($67s}dtFfx z3MHeI4&0{f^>p!}zOBM3VuP0rhn7<5N{Kq(Ul=zDT|J0*ES_8S=p~IX+ zi6E)Ee{F(2m&OI?>6DFIKokqlNC6ZCClbj@4JBGlFq?60;hV~%xakGjp}eIRY4%0p zf6Ywl4T>O2N34;4?;yRu%XjpTK4E6)eU$J*mTnVdq1C=Ga!d&_ONuFk$RHyQx3VkkC@~lA&sYm~fi_bn%Yg6$)kmJy zk-TLyh$*12BHf`SOf6?NXHWfg;x|f^llqxM$+277Hwu_aQ{_Q4;jE}I?-;Z`>Oeh4 zkmQcWjBZmf-O8YICMVci1`4dm>+b{o`n=hAc6vHMF{*#U&7WK2Ab06|NQtj9BxU24 zC}Slyl;Mb$5F$g$aFg=3Ej0}dRV}*sBxTGr@?T%RpJ>>g(s(}lqa7|fXZ8SYmeh)4D0IHK*gKLvwp9li8VhdZL*!M|mLBonBB)M8qzQjCyHT z4n8Nzr+^-QNfqWcmosNa<0k0s9?tsgHC1pG%`|I~WE^HUPX_LcN}468!`$Fx6y$?V z!c&*0ho&hf0j47K>a&SPqlyRAC5^r+%q>oQSvX+&?dWC00B%Y-!Jw|U$mRr|7F9){ zEU6~mg7oOA|M&<#RY}(|Xp?)7JhMxLHC{4S9m6T8l_gf4tay105}Aox*Xb^^xpwqN z{Nb5Q3PDjPwnx5tlu~`kLlH$Ihc7)#WKeml!{iqz>xG-S9}zFDQzT6URTvM3 z?*xL%j+Gmg<@W57?^K8RlO@oGqWh)j+t^IFHTh0RjX{am#mh9fVu$jHhcJ!e)o`9y zmvWH50$OJTnbWt?!vg`eV(ld*P&9OqX3s^S>l=UABlJGEXs&{x*BKK|J-knLko@K# zL43Q0GTu$z48-KV72=&fmrwlM?jGRZp?rUcl*oNe8`9|$4*3(WuKe_RqT}6Q?AmCy z@cdPh)2md2SrKqZwYvu`YEZ6sE=h{*;zU@74cdY;O*GOMeEX;GeWpwQ6XO!Aypor_ zrYwGi85c~Ea3&bTS)G$T6V7v(c#}5#gVS%~){Wnp>xw-Sb1E`<8WZB0my~tK6FNH4 
z05w4Ux_NI(W{T5f^rvWNOZCLHaU8j3c$Xjvp%%4|^R@ zm4_*(Nx%hR(CXe>$KE9khK<; z7oca(FDK6Dt(?u8tW|3woAcTa-RsgiP(#^5RE1u5j1R;^ufU$?3WF{^ddZ-+3Fz24 zBp$Ym?v-Z3dZ>gF7%Zy!+>p^cd9Msf306^lamQ)k$$P@t51@PWR~@|E^b~y(Ok#QX zqpX1_0nMFSbdpg1tRqD?G-Yz{XwAnHdS8Ig<89M74-)M_RV>0r%R76a5@B+J?79?> zGy?v)RdZ!TS+{ws>TVwu?NKBF!CE|IOcl%MjeknDoGRkmcpv-|Wp;~EH~!}XL=iMU z2z6!&1^FU&=13iScu=%&0Xjq47?|8Fze$$1(88p7Cr>0hm=9hI>TcHZdw?iu&+k<1 zfZNmSpFgk9@s@vILOguYl@8dcw_tB5FVWXX8X}6Gs$@=zj!uf7uK|BUn!M#)?e5E* zeZtG&*BkUyp>r-ZID%U zM#GbYEDoD%Hl|3Bl8`9H%@v^O7Oj+SFSnFTozFx-9>|emOV5xxt>`^%0eX&+`1T&D zHc<-7Vyrpt0>@n`qgEaYsa#iq#+N^^T#1y!(8;v?L!htYs4jXabB8v@axB_QSo2FF zQJo%9LlZe1=GQ*VuT^@7rke52FZ(o*?<#!19vxk^zJ-;GIu}V?nslR{a$d;eY^o02 zD=LZ3oEx^9I~hi~rr@aOOQ*p-J4iq(WJfbZ?myvt=<8@58ee|cE)35%% zT6Tb_q@-CVuOQXQ^7)2O;u`E!l^gUZOP-RCK31 zmWbjErA~%hK2PTu%Ql$erb=$=`@3DI_L>OD9TAlF23-c48`G^LC}A(cbVsh~r8Vaq zTS%7$qRr+PRDslFCX)+D0|^VIVq-b%ayh}nYj`fP*;st(I%ZkF4)v9|)#1*URL?87 zr~qR z$CIXoNu5~aW@;wIe1jc-#RJrN)J5aZqEoEJU~@e+93`aA9OiJH$3GnBCMZ$)ayo?> zBYSe^;>muQaZSwz<1Q~$ShexrFrSbNv=gZ98$31}%F|1!&@auY4%f1r4VpR*B->Hvt`J!J|*M-1dJDFU3oGH4B}p;_<#mYZ(Sbd2N&f zMM;M!o|_EHuT71=aYaeuqnFR|hl5u4^V%rn&w7vk4a+8);_|yW^<N1boF*HGnUttFh1*vTnUs$nEQgqt#d?=nXvMg;8x&b^7Q zokBBqyptp!XD(={9p!IZadwC|UgAoZ3>FJB_0_q+nSLuMat%(@QZjev=Rb(#Zrme8Om{O+!x=MRwnlE0a zsh=3NGSzDh)#Cyv?}>rHB;u#Kg?F#hX%h?7`W|IdM!t-KJ+6LhEGLN|-&r7ATHOaX z@|ngxuX&zqe5h1LcI+)!Y*b-S9^ezRs%gTxXU{n^LGIrwt1DYuVl++Uc#Xz)QCKvn z`MuAKU@b^psy8o3DqR5__zG#$7A00y5#K0EM)3E^uihsgZVeZ}AwTuB?D463bN~;M zLkf34zg0bBv+yAo@gKvv{#@Uo7z>SW8U6|JK(X8oNVF%Mx}%hC(jckbr`UoaZ^n&L zch|j%4MHbMRbjzZY6>Tx#r0ZW_u&>6;tv+t@uljZdn7qW-P!0Hr9Ic1*V&AehWlvKn2P2{}w(oRb2(+4lp4$1OdRl`C`L;Rh&`)1$ z;yBfWUJ~&_;1t09+3_S9$Ek^}uxrD)UFZtV28*V2o{8p+K@&=kK{BLv5uHj;J+&7A z=;sO0Xi=1w`lmEBQruGK%vHPmMU4CtGmVtdLofdF(gUZNUwWTCrq^B`w3qqQPQ8IT#Aha4bbu#{Kf)D!P(gH7Xqjq!fi`OLq(nkJh^ zTBXe%R8es%DCuYJ{HA^SI;9?_leE4`ukO<+0VTnuOiWdLpK_je2R$F~>Qz^NeL;~J ziJ$1ehF%ZxpVdBX7`(>37|jPmIiYa#ICr{E-1*^FhxvWWH$1|-qwbAM-4IJxDf~Qb 
zF(|y8CZD=rlQnI+?zbp^wkUU)o~M&!5(V(-RO2x(Eh={;Ii)3U(e_1iNtf>X7IUsX z^T8K!=c4GiCQdg^gKYsBy^y@8APal4-Pzqz7_x@>y@9#g@>Fy_ukE?wQt~H^Uywfy z;H37JbwI>7${QZjci~ny==UvEZQ=)l0Tsk zzYKEf4unRe?}I_REk7Ol<^@kAcO=1AG15MST-~C6CbOOaYtlFisS>A=_X;eN9JiaD z4@ZnJssEx1vOrmcYd(fS^b5VJ8r3t&Wx5lL&64_W7SD~7;A^3BlUN(tejf8a+j;Aw zT+b+Y)VIJ$(sAh%<1LSZLP{gnk>X7Z=x3yB+s%u90=4P!w)EP>HwTR(wiv1`2>XN2 z3c*#NpEqyqAp3W!YUb+p+9$q@l+trV@PQUF?ThEeFPT{fl0gvWRAFwzpf$+oF+BKs zgOtL8y^)Q6at3^)xksL$n3xCmLf?!K`xGza7}iPd245(3SxT__Sde%z z&Sn!*6-Lu>Pi)=lc7XC{LK%IGGojVJBh(@$L0Ke`o9dx<<#EN&IP;T8-~H6z#Pe6% zaHYT?kN)>#bh7}^(}HFUr@IYSYEV-3mpup4_@!$*J~dtZlZ_t)bJO$5a6ZSgHIyf6 zh0`8%>lpE@1~`*iN@Uug@(WILy?9}91I`yBegNB54P(7Mp}*{f?OHF$WCDPi62aslR}h5@qX1v2llVq|_}?cZ?n@2>%a&x-_EK0`zclbUdn+w~Y|l)K_RajFQf01O1Tf z6-G2AZdH(#N&J17_0fk(AQPgFI_^Kl2xhyE+cVSCw*ejK6)RN~rq5pER-=|Y68fm< zU^I)_Q?TSN*^}7dDk!U0B9xIYFFVyq66=bJPq7Ou$zyTzEDJx;!SL^EbO(#*Cn*>M*1yWCZnb|B8)Eg~8udjQx>_k1!lK8c>?0UzK z5$}c|8dYx z)@0^85*w`J+*2H!Zl^>BIQ;^Hs_Rg;Np^6`x~RAXzz2srKoxgC3u=95r&ib+@LN7a zd!;h+hPUJa*aU$44j35ee5pKMiw)=S+ypmJlpZL7YhxG zRyMDF!`W(C{L8*6KUU;RLqiF3Pg5ukyeTkB-?aor0OIpgKM%e7(LmPaIsLu}j1ED} z@i_js=egSiq`_pjDc3isfdnVWA#uOrcW+2lzr9UEw{0?H5pHn*rC4fr&si}qr`YKc8MEVkQD`muEnqePm_ouTnFWmRxeG?+J5E-BU> z^z=V^pe8?X*r88uN#IffLPPUjb{2(n4wP3?gS=%H@(Rwc6ISBB|1eK@1XeyuQ%_+$ zPtZV%7eIhSj^S@VZz?1P$oK}dKn>BcOCR|mT`qI{^E7_EM7QrB{qfH~tMk-4f$8a? zG7vy^M<%5^UpC=9yGv=$_LqsC=cWW`a|TOQo5iAB)j^v%dtU>ZRM7s(Y$fEd$W0>u z@^K<+0LUTrK9D>+vnbafpqD?jzE)s9l5nV>CVKACs3(c}4byK_sGb{<(qMM z{n${Hw!Zh^1VApAJEvTI00mFQ*|sb^2xg-}YB)|5vCW=f% z!iKskhTmlk}_-!E~cW&~9YI<7!hp2rzn*Btq9=%M(0}PHd z-B0K$y#RGas8_esqlBYqdXHa8n2E*fs_@(Aa#0QE*KLindgL3r1*Yf{gFp_QA`B<@ zhMh~JKy{e#hMjVwq**1FstLz9n|KGQW~@TW)o8AAW>t}|B|?gDtKkIH?(hE1iDBpi zY6(N`Un@N1%eN?XP0}Um!H!x2E%2aQUJ1q8w#-gDvRMQ4AWO-~nTT>LuBuUjAfUV0Z>F4A^B zWN$#~h4*QBLzA3Lu6zh}%iJy%l+^RRq?O86p>m9Ih}X^Ke#RKpanEY}wCU7%E5gH

ZoyT4hP62x zqz@)CbaXUS6ggefhO?q6aXp=!N(lm9snc=wH7g{QI4zAy)rl_8s2!O5fuLIXJc4D(90<1sVBJ4H4pZjS1f0uxvNio1d62YDoDS;;Yx}pC;efU`v{1YL{MoC4Y$_v`iw5Zr-hxO z2K0RC{T67O=g68Jb8*@@ma0TY)?Rh#c7!Q6Khi?0tUVi2?E20zB*k+jQh2^3L z9}KqQC`c)3fl4o z^o&83=2^9iOEhS2vVwVi0}!`(@NvbT#-q&1gH;FU2gXOVi+}Q0)bdvp3Q|;aXxOI* zr~f{cP5OHb+(G{RAXNg^g_YtrlTLlJB6*mWc0N1KCOHO@MsJLO4Z)M!jRCmzMLXwl) zls%`OIGd@EQ^9#+(;<7%8%DpTN~0kLvKZhI-4FMkIT^A6-AXEu)PR28enH_h%rb`K zj^y3tv>ut%kv_OHGHz8_BApBOI+#=;X+gR}ud@DJP3$2Mm6%Y4_-24WJiqhZdhJaU z&e~xbuM1kw2`Hp=Rp>);ns~sR=#Y^nufQaiPltI#$?w>9)7aGDKBdu53@2%^^7ueC ztLK&PQaQYa5=Fdcx;{_lDL0h3_0vSXVwOgJz3QbYVVe0UiKGqR0&@^kmqy;Gi)e7z zhgnaRWS9!<=U{~LT>*5y@O6+mMcx3^Q>`8W$go41_sB__)PsbE;Hv4tCcE6Cj`xb$ z?v`V`3d}v9WP0-PhC*%k^7Ld*k;dZ=8YsvI6}`dW?)Cf)r8bxf&hzZ}!O}NX#%O*? zxVTH>Ev5OxWHuIe+KD;syvvmlH`#F%gUmh7S(h#1dUTWpgRU}WPTz5JYHHT0gZxQ}2v3O!F$K!;pmzxOM)B0$*YyV5G&if^ z;X81;;8bz#lM-q4_`rhQf^-<{%zcILgU+AtF+s5=}b z(Rj}z70FD?D>+(_s%;C-%V)nKN1kpeOAW#?bi(#ryyt=elC;Crl-E7o0Ssxsq#nWlw?AZICw0u_=WKa459c1Cvtdl;-BI06la~ zFVBOLSfmFVdT{aX?-4YiIv9z4G^U=f-e^vq_#sP=zEZgqM5NZXPJt z1ayY;>&Twhl~*ZpLKlRfSTqD~(IRN}lMr9~W$Jbsc6Qm48phQ#M=h#&7HvMvo*br- z`N403nZP{tFB_Qq3*Vp{N8Qe7+tDaFF=KjGWA-X(!cQQ-4?)Irg!=N(jrY4C^SQWD zhxDy*`#2jWY~o3BB*2JNghTSIZCa;y+Ogz#f67d z{c@5Dq!^{fNoqs-I{wnzeF-o9+AIFoCP6F;%vUHPVlN9n0Q*O4TXR{Uz> z9c4BNAT$82H)=gH^F2AE7^R1HT*v2d3Y&H|w?j186qV%33q9z~zB)dT)*gk>_|SXZ z>mmUaiUgFQ=BKJb8?xiuoL&>r7?s7M#|-yA7?vajPFLJd@u_bCst%M+hS`f=0hNvK zr;k2KN4;;V7*Gz}%}Y$Z@>U1>bJG*1*8=6p9)d>*Nsw(0eMHi0o-GN?&-2mD`TeYb zECg3~M+S$jMOzxmU{Oa(@KU?z7;1LxJ@-g#F4|X-Zno(2f-&D)E1b6FEYUay6bM}O z^Mm6qiC?>T=!AIy=+0+l(0Xg`lz_q@HF}9<=hqg^87Pp^WX<~8+M^n^@l5s#PM42= zCc_n^ylN0OGi7kcIgpK*oxLjDWLdL3OFOV1YA~_HF6@Yv1jWw8qrM(_AO_s@25$J zgSM^fQ-hOyT<9gL5PG>%UB~mJXg~n=lKuoc?86%({t)8hckxP{r*hCz zY++o>EoxYIArWCCqgw@mM{=28)`IUfsG4m+Z6+ zNI?lx23wQno$Im+UfU*UDfD5RY-S2uQUMlaDb+c87c|QSIL?x6G;wN%TLa*P_Kl<4 zjF&z#6``|=gBC7Hb318X<$`n%HWZezwT_E70h=-W%38Q2_POX!oHRXR$5W zANxg=w%Ta`QNG8_J4pSRLHXrMl;gzvTEmII=yDxG5_y|wPToe*&&q^Hk#=jj7hAPg 
z&}SD;@$JfEL!+mIRmB{wE1yA$Em_`jRi=E9qXVvXAHY;8i*3LP6(PD`6l&FNd*;66 z@2@AUsddUzGRQ&jJ}@_nGwTH+tTVTUEl|H#*XpvBwa<~O>rn$xpf2O<&e`gW$!Yb5 zjhdtEdCpAkl&CHTFO#T)piwh}#wi|WC9LUS1&AX^(*hO$2ghbzV1s$@?eYlx+}9nQ zSf@PZH@SKFD`2w{a?3RUqPpRpxsLY9S{`Wb^#}@di1#blCD0NC=iqYaBP&2B*VekLFMN>OwU-kS zWW(B{Uv|i)^tbWFE~S=(q)=Uago*(mfJ9LhU%0M#0o4be2h5w0?p{#GHf>4^(11DA z&pj?x$D+c_%w?xoi)1S6r$65Q7=Q~HiK zsyG2e>|Up7IC9g|-<-IVRI;#Lbg$B6#1<%G72$Og&{;%~z8OCB$1vpI*bm{{+vb95 z=$fT+x{Nozj$u_-9D3eHL0m#f#-#!?5ghtxH#_pQ%E|TA;Pz9K7ZIt027%3T0Om~j zZ)`Wy{(b`BXPQ3QhuPozzwb}u{qOjNQ^Dwn=T@lXcL8Dq_En^dLs&MywDgvFr{P{V%Li0tM zCOq~Z(212^9@6q?jKvw(^?}Lv6hMug7@v2LeisHMm&EpVeCk={|E&bPU-qA_CbjLJ znnpK0I^{cbcP}d+x5_M?51kN68UYs0YO%SEX>N2Oz&n#1|PYTG^nVpEc>e?Daj?;HPPRPeC__HP4|M!2CQa zR8g>hjx%1PPvg_e=-+a-&qIEGjdyiD**byPBqtle#($LOKzUZe`5Z)7?1?bXx$t2W zY5EKwJ@ZGG?=DSAuc-YV;QZt!{Wbp9 zS*)ushPID@$_9z=G7%=Z_f?#AvhU3pblY^XN75AZ`GG{eW`k<}GI=>`yH{ZT+`3;M z>KDmJe~j6T9$cIXMzwnewR<=I)^~#TNJsSPK`^IT7jJzUIjWnpE+zbkJyR7Gt?|7! zBnGb-P|1PHOv+S&i3!sI6`r1AjRBj#8 ziRfPtF?i@PSi0;|UPCuBJ)w1*{@kWa&d@x@(WpJ=lbSHvJw(SAU-QLRnKrBj^MNu{ zB@)RANZAKs3S}dPPV-kO5Z$u4)d2pGk7swMi`)^$iVQ)H<`vbUl?9;$??owa`Y_#0 zb)5b98gWhxvAd7{BSaDsNI}kocgflVgIY6<#MFVhQOCb7F!Hof+2T#o5JiLb43Cq| z`mQJbs0zwYByvn)#`-f~CtF4u4~0A-dkZ1)j-?%pD4$MT^>vhw4C^+L1WvLZ7=T9R zsVg;_3%HbbA~&xAZ_e9<<{7hAN=r?GQ%)n9$J#p`Cyv8FFJ_IZFl%M!p?}@P%XOO2 ziF;ETWW{~&8PWQLQ_qDlsZw|6IndiAiFQ1AusZlHvS?lqF^i$(C=SyClTW~UktwdK z=1lhN)9Mqm*<57q4}dlf>JuNEKv!3Moq{ZoE~zJ?5$U>Sr1|Lg!*)ty$E5YKavz9b z-}k|wM~0y^H!o(CdKx#WflSH!A*GlPV!V5aGMq&rYI>O<_|kiHb=!_#qvq!3Xvvaz zylaN3h?XPd;X=o|(Bei|!4#?iC+Q7=*(zA*3-aU668AxbB&kt_+0LtWhiPhSpgxG@>rjE!4RfZxd_?E$^U87FPmr#`5*-)xa056ay;UV#KQrFY=qA`ie(ILB zPcP4Eu!FTj_-&lDK}P@Hg+c!bL$HGhYs=={sWzXc4oC(q8<0VF@HrHY9>R|u#@eigo)5PX7v^nG4 z^-s1qEeT<3jPE*@%#F9!OtT+UNb-GEoH>oJPILKdpu3J~0d%4-r(RC1ebHwGsB_*~ z+RP+VhsxG?hfZK*jTaC*OuQ8*;4;F;HE%ABKCp2v>M_{nYINlhFaRL>#QSF1m|R zI6BiU;L_ejJ9W}mtHO=Dp>(MBDv;~VyPKWZu`l`#du>U&j$H@=G~5hs&^a;Lg0}pu 
zgTVWC@9tN1H)zRgrE-2WwJ*?uR+OOy`BadPpq)Q%0&dr6P4!|Rxh>?+b#G(XB8nWk zBi)S%G@>Qnz!EP?yvOQj>228Q;yJx?t6O1WoghIt%+H7tZnQ6hYkX!Mslok;a2+Jf zrl3{v3{VQey3Mt_ZkaHLn?a)LV_n?CmTg^_S8=>(oIq-|#kvQ3VY+jX-dJ0Aw`F>p zuY~$a{C86+`Cn`XY!0AxX)}l^=;Clccj4L(=N7WgrACs}nuOP?=lwA!HkywRGwH`w#-ZQTsAZ``u- zw(1GtQ6waL;%)#n010?31~%vm5nY5V1sT{l8V{>k7abQU5(Q45H|Y3tNc_%0dH>|r zuf_RxnDZVoKw2)IIo?<6h=dppy3~tLHj&evA!YSfcg9J3{VAUKKB0vO8X&`pTh7C_ zj^S1LsI|b&35Bv>aj$o=(YA~nf@Z=X@4$PmB3c(1+TbKT6c=o}g-J>YVcfaKb_)sX zcUa?I=v42c*8r^q_7Hn8<}YXskx~nN9qz~2bF)70+EzCO>}h-^Kam$=)X6 zYXMzhwq?V@^KD3p1T9Uc)=Wr2n+z)^qI{EAUU6QNNBmq%Nl>sbAWtv5eWX_Se6U)O z?)0kv+iPG6eiahS@gDH3aR5-wy!x7|GfhZ0Cu$lEP7*r9gZVGaiOU#9s<;gd2O1r* zS_L+NgaN#Lz_X^tKy0<$tp)9|57SQ;UiY@iRmOgu2VaE8 zD6@oji)TJKQu4hiW4A#4JcK%J&JhYP=O=5`M=&Lp%gaj4?QKb58jXX+5VPl1Pn;`E zTQ%79elNCXkK8@JHz~z*-{5R4EkHLT{{MJtYIU4UouE#or2uv?DO#b@5)XNN5&pJF z!veOq(cK#KejnZ$)c>Y!szd}SsQa1&<$W&J-;yS7^-$gJd#hb?mGAQ|95oEilXk+O z{kC_U&w!Ja1``s`UiIJ?@yo#zaGlI(Au%)5dF>20C$H4$T&b%AZE1P1kJ0EOlC=R2 ztHMOk25X)O!~nYzO;~?^HUps(aY(+U^4I9VUy)5J;w^=Yax|~BEZ*YVz(S8mcBX9~ z^c~K+&Ub&k@1<_siGK&pIR+!Fhd4uGh-kv8i9r2*x~axRANv4ji&h%74aVh%IA+l} z$T@3mH)(e_>{c&MJMngKw#)_5uub#;(R^=soQ>Z0#rJb*-RI}&^BCvzYxJVWyGSA* z{mfu-h;!ns&DW{ywlCB&A|U&KVLQb-p14JW6X`nrA2|N*0a9fm1Tlm&MylTPJWGMo znlv7A0Vlgnld2!F7?AkSr8dlE5HLE@60eiz*FK-GL!JUE=qr$)+xzLZzU`j|qF0!I zYmOX~QXaE-n*tG3$2U}Wgp{G=4dO-;w@Sk$x~ap!iF#sJ)6l!ky2~ zI;YgNaw7kzU-f9rFm3VEm0@a(SrDa$3+NMPS(E~(@OcQZ1t{H?6Q$z7FZqN&JV2&E zBFOQuOX|cEjI`2m2H1Sdr>?Q%3yHclZvN>{DzDtR`p}6*ZRt15y)o4(s~R0=7k8=9 zynGzg(l%y^rX*UN8}xG*>1W*B5uqDjF%k9@&rNQ2;uq8Y1$uIl$jgI>TQsOCO7hX$ z_wz}LR}^Y7HBThDdAVWGFZl-={u3w(d>{q5@|x>@2u@Pkp~_(No2|R%CrPFa+5&S( zje^5k3i0?PHPPsxxS~Wyl!cnkj{kSJ};R?#{TWVsokn{+2pltjuiQ?vu*gxzw3T+Vk_dbx?+`R6-D_$g#(z!K| zbdG#$Mz47mWL80jJKq_w(u+0HoPMIXLnfp<&Id<|Uw_@yYu~@Rak%#(r5&Od`qD?A z>~-jKzMQhBrx&T%n^Z;{101}#2?X*bj9bxnf?GFC3XWCCi0qG*J$9{IBHft>%Qav} z10tTTewHmSpyF7;fjZp=UkRy9+d!td#nI6=jX%+!zg}_u?~m1<=80{iSd4 zNT+(FNAo2Tdur(nDv&0Fc5Oi6-(^ABF@K%J#)rW|)$8hr-Hxt2vnYR};+CKWg_g4) 
z4>zmlHHPV=-{HgG0d|<+Mg|j8ZLjhV_ z5GKWk-yXrFi&Xv_nEl7cfaWFVAE0?j`Eew2;*Bp7i~a(X2lnW458bVfv+kL0^SO1; zG>f-N6>q5yawO^qC?b-Xp7vh=9dLQAGNWx1Z0Ng&ECDik5RvzLIIYg(^esR`;J9`_ zmt51O+T=SlEFz!7?$N-CO(0T0Yfl~JB9Eqvdx>?54_ly3UsVN)SkgM0sY@NOdwJK2eG9& zU}=_U)N}S((RFAxakrMG&v;gu+pH%)TC3OIQyZtNBi9)Uj`4Z&VIOGlepaMt!+C1b z%hixcx6>tAamj@7yx<-_=_cUytl~_fW75vd`NhW-ZK=)k-{2xgJ0AutA&|y=5~0@S z>~2+zyOZ?)tbcKt?_uI1_XcDif_e|<%P9fPQJ$LS!RknfK@Ys%-qPjd^YK|L{0uvo z8!oBbo=g&X0#i(G-?s5gndwYDVBB%-rmoIZ^ZR75{;pS=KlsHaApUXnD9-ug(*1um zbCSd+4M}##qc5#wM@nxleRpfv0k<52jJ@Y|hLg?O;y|YYW57^7sL~z3^f}VtyEJFj z1a$8+14hTeW0w@=%KCr{HE<6~@Q?(pr_gh&AE}1d=}{JE0iV=quu%nm{OxA#!>!@$ z-!P<3d7PPQ)Xraa*S&8Qs?2d$9I$W6JX7M3ZJJj+{aK+3e5t zsNYU;M4qJ{B+unLY)xpMduB|9zUO=^S&Eo^6V3tT8J2)0eF3)_tX**w?4Bc)D|fy) zNS!1ZFJU`wQd1^L(zIsU$a>u!sq zB(jHJH|K)0-}|=TShJ2Zr<*OjCS&2~7dw|Lgny?AV)S5-!a;U=pqvA@Shuoo$-frTwjn zGfxmu_JL&FT5NjuehN;C3zlI6(yX{CSY1SkTl_JfeNcY|(iyA(jZb}%BpHZ6DINQe zI^IKA3pVXz%{w(L0aN1ebWrh9+`m^c2FtTI>T@7cM>^y=p>ZTa+*H8n#*rEzN9{K6 zs5tYQX9}HEW7HZ7p4^~JT$q%xg1gm0Rwc$#3hJuY44j%q$PBVOPB5H_-zi`EfhhVL zU>><8ZanaQ@s(>LBXZ=-B5f`yemg3hLFv5P?8EO*NR)!58VR z5A>%FVGPy^8}GR=<3t*}cJSxcVbhb?Vf=%!A$v~WYJHcwU1@Ci6eFrP_wjEB=N_KE zyX%@UZAtBB53|myHxVe6ngqW{H5^4b*q+o4ZGyslWMObq00#npH|R$-M>^N^0AXA(}rx+I2x0Lq8*yXfGrV=l>AM|0IzA`OnUwy7hli z8RsvY8`J;wSO4Z*-z(+4w~CUnYnlU7&WfH92Vm*JGq2fs;(We&>^Ib!Cs`2` za1CM;t}X$nh)c5Pr)(m*AXKMNe{Rpu&rdHeFZbWiFVEBG{`}|pclq3PRGDc>RSWgsoBV{E6?_$cjOTEtF0iL#IM$B=*+t8wt_bLE0O#)Aa&k{kXTMO zA5VX;8xQJ4{**zDP8Gsv^zfO*r;3iWHO!}KSp1uR_M`cGbR>2(5E-p|9p}bK(0rCY zm2FveXa}NSFFH5Ts)1#DY;jYmM3LPB^d9DPR2CPi?azA0-ZF_Bz3Bjp@`pb|O+iXf zi6p)0K^Bhs8@4kNk3_f)Pn_cVgudi>y{e^GKD#Wkokzi`Ux$(Dq z>DBN4N%p_H3^B-q|M}?7E&uy7u||f_vYK>61d?I7ha~k5lT@Ik6P_Sg-4Y;8Q%n}j zV73EG1&fd@kC4K|d0xYaFfMy*vw-2Bas9d4ESg%HHT8|5gCM@6AOoR@A&O#S5YU)D_4Pj-l&2=H~8RE5Bq&c^VKd88PG;DrtNmL zcxvMF?K=l2WloD($4!bxah+Pn&T(!hnL;jtLHb~zSa}_3Ha*@cDFQBWkNnLF>EdRn z)Oy{BbiNViNEio955Lk=^mcjiCq4xj6@4zYz!}$QisGX+r*voqXZ-c9R@CMVg0lIA 
z#Aq=jX3(6YHEBb#2hwrBXkrCy+yup(GPI6Lz*n2HCi-g(i^1n8wt(lGRhg_}S+(q@ zt$oTlE<(%e^IvZq%`y^*;^U2~JnzW@P&WgOk0s_;TYxThOLuFpNmatc6+`xB(06@x zug+Jh>7aJvwJHqimNKa6Zj)YhD^5fkykv*Qr!<5mFYeIE*_4Pe0=n>v9m;#KgG{tf z=Q!!>4P(E<=twl`H+EPfDzQ<0ZW^yx6{!ySkteK3Ga~(!WF+XD z%QII~l8I#4qBF%)cL`{6Q0PRZ9Ag#e7hR}uO?&@jj2U0fQuvR=@L(p47eDmC4UG;g zo7NZisu7v;t%~@BMvtT!$M@lV{{g6`8zjj``Dka56)zjXi{*T|#IGW5O}4dncaL%L zJav7Kf5D#)6Twp0WO3^gL(`-K%t}ffj>Ucn>$DeK$~U%Pr4m`ZVD5jON6kX1RqkPT zFSNSX7;}pE0a$B0KAw??fpU_RE(K`YwqcGDmbG^qZMx|+I?wA=vt?=O!S#?B>0(f5 zH0wU_l5bzoK*qPBWlJ2wo8}DM`fX^^VUG7LoS1l#X%@A5#S>*dMXfD%#sBMqZ}tz= z0(yABEuhX~PgF~2=P2l_J*O*-7IoCN+XD6QLRdENL!1#JHJM6ic4@=9^jAm)^&0+r zq&u6mtwVxricV-Wn<~kG#>ZBcKp`erzX$dX5;lPxrNyH=4A&y;+_3FvkOW;^;QMCh zzewY|tj@Uy@`oM#iD!KcwNz}`$2k?9TQ%+H6BStM8T$59ne6n6!uPAx^T3lA2Vk3Y zZi$0MBA#?IK6QLfBfe5BXw_NDpaJTRQRh!8o2)~Z7GAI<(&#`|2=Pm|Lb|$5yEddR zV6-WC;9VGBOb9Pye5E7Wl>nM;k!OvI?y2i2o-ElN;&dXXf-YG!(7-EIN-TBus2-%S z_Q8$=4L?9g1<4T4)ME;)$t%QvcQe*y&DX<@+kIO1w5L89Yl%`_d)62kn9CB4DsL&Z zS_S$;j6nxGl}~R(Uuiv)UwQrKj#CNgTN*FoyFl}JE5aw~(+0FR4lNB8j3 zR-M;$vZD9u1Wo3bnLy5K<>bs)dcm$CH*!~ZLEWS&%@RChgFtS>ZCYdkJ!q9lT;Uw$ z&&H$$5vYj2F#dAQB6S^t84ig7d|`^R1yV;GaWn(fh_s(%R(9KY9?7jHxhqLsr-lj2 z`cS;8sFEslpVe@F-cp6i4HKFAxb;;ZqlH95prcV|7uY5ZoGuJ4o~e9I!tWLP-kIT{Lr=G-2E(%*4)S2UjEyKDHOw*%B1PL|I|OXzv`c| zOgaUkXdscKxyEkEz?s#O^NI0+y0H$2nQgLE{(zr3y1b8&*Jz}tJXDq9jR~b6wi8)c zhh{@!F=>Dt9WCXYoqI_7zU(Pc4>XiaXNKloJbdPkBUK!!&W>o^wzv2Mzw~oknOGTF zZqH1Gb7M|UD&H%G>B0F53Z3TOvdfJcE$i<=N_s}8xq(?97EYaym}u^z`E?(J;0D#) zpi^IQ9qUcHK>{iXfDOfcx!+CRFxEU3PQC0y^hO1n8x5QZEZ(IlsBf{U@g_f!jV5i* zOQQV+&3YvJB^gxK{pN@VQX2G2RFHmu{d~u8xqpI}{5)=Cp7bf`vai@2SQi&pKtCBW zXk2G6bVUW-CkE}Rb!SPnvmEL=wbK&dk#na>+#$NpiW!$XB5E4dsGIio;Xj7w5A*Rc z3=^65K-cYy(w1yGkLAgjl|NRfJuZ%Ex!eskm- zfKFFGO49I_FwD$WzMu@4n2f4Ow~4YDsUM`L8AyNi*(ahcB=lqq06dA7&PSdzuDHZw zMEncqD8Gp59df1oemZyQFEB0{)oZ*_LP>5?!`#1kn&Rr~7O4Nh#jw9uhjH=${P>s9 zNA90EW!}+fd`4W4gT)GdrFD(SJo+ux?HR&-QNl-vO_K8$sOsx^1|r0Z&CU~=A|5mx ztr|{NJe#~KSko@Q>bs}0J589DujwS~geyRy$p@DKo6QaJzeO;dmNF&#Fu%Tje!j?@ 
zpET@D<(z^32%||4Kzy<9_c1*G(!;YUaf*ynC!G2~yI2zC8uELXE8W%^r3?N zl?e5Cfc?_l7i8i3#s5Sq--NVsp2C)S>n!pUlSd_~%?AopCT>%9MM=3Z2slh;VlJl& zlUnutT8VlcuC};-azh0hLHg+L`z~%%{*sntQ9%-~b{%Mq#BPVQwjn=3LNAg`DpAc7 z?Bq*|{P>eE{|(S#TKHCzZi$sOnN%tW|liV`Sm>J2WnN;LpI+?~sry4cj4QzsC7 z&;flhp7rqnsF?$O8!{n715h*^IYZSEKzrSRygE$vJAYz?q!uYOAF*y(GUpY@(LO}RJq8=&M2!7Qg?TmFuQ zg&fcsB46unEjU$#XA;W&+~n0)bZ(1?4=6N00?+fRqS%DGxN^5DO3)gGaZp%&cbMaQ zXg~&6emwHlf!9ozNN1)n=Q5>w3sSHfs9YLLP&amcsGAc6;Fur>T~xd*mrkRgL~-ZK zEJ#&D8W(>Xh&WPvNxxT*QSl_bG)d{`o*$-&nH{F7zEg$yd7IzLq2S`-8{zU{A`c3a zCSX&KE;Q?i2{BHd#|%@J4|V%0&iT$~Ju$2=(nsTlU%PkyiNsHqQ9$pOwxr|97GAHD ze4W2lVSewpq(foOVYrx$%>4y|`p#`UQ=AhtZ~~_2JelM@)j-ynb%(7gX4|2cxJ9+a zs#Ao=pz`vA5FxW0)R6wL2D1SSH|nm^0SZs!d;jj;_`c3XwnBwwZ&JuqMTuAWbv!~^ z=ryEO)wFHev_+pFjhTUaO+yvi@e+h$(P(<_(H30=`gzkq2Y$#cKmN~q=f0&E!Vy)N zNkUXk@$Zv_*K1>D&$-ntcrNq`lQHq=3;NzVKPYhpr-EDR$bkl0~ z{JtgAZC$7#I{`z#o;Ya}q=5s4K4>Bh@g>qF2iZaUCn!StH6#80`rboID~y78Kk9g* z(D|CC_sGz4So1A*1yhVVNI}w!u|e?U(G155CR(g7sraAPO-dsic~~bkXU0cA4E>OQ z4guXZvufcXG(`7>&TqS8fY+dhTr^UaCV)GuvW%;6sK--@{eWUALx*Xo^K?dv6a))#)h*=*CM zf-Vj1n&|MwH;Sjo3uZ7O@NKkFM9_E<^9+2zHW`GYB@4~ z7(+b=ZjpDIfTsG0pX(#F!$e5YS#lrGnm8^8PV+Mwk)pE8L|%d^U(v({+=f&Y&T{?+ z(C>Js)cZAFxP0T2!bh~fNTet8+MA-3Ppoh>r+R0=4;%WzbN6Xi zu6?=jIZg;ay-q}{Rh&I^+fHxO%WqSz;7Uy`?@F@1D1+4f9EOHzVw1s)ul^S1_s#V8 zz8jvOpYCJVJrR+m86{2Rj9%^FoFodSj+j0Ex{0ma#s>flFkNEc=pj+2iVLblswc&yt4Oz>j|T?G{yQt>LAy6vl=Va!-e>4l=7OxE zj4(CYVeVxz=S&$>ww0TxFs6L(p_u1+H*ADDC!W(bO&Pr^)L|HRMzi{3sBN5pME|cf zIr);y_Xblqr9zRDUk5tV|9c|n@r(q{qnX#)4gG%`5uBm9=eR>5_fp!_GN+V?gy57` zfkvDD0)fzlE2?#HRALkp0ZHMMn!+iSXx~QnRt@G}W@q(L578eGbD5$Gzf_)_kYz^7OjZoP4y<-6T4c(Y57}UI;1dYXo6X@=XT*+Srwt60s%tA%p$NW z%CUw`2tObN)dKY?9+%XiMYR7QxS`;gYUD`^;L>TzNj)(CBkvfubg}S!z{T6db5`G= z>ru=Wi?+-*r)K@G47-uc`m*n{<-=&`9c88{JZWXw#Xb^j{%8Sp3#!v`|Ez z50hnXiBEht-K_!=-`}Rn4gS>oTlRXAbgeMGz|vZo7QC>x@4(iK4U za*YyaU|>Lh&P__6o!Q$xlTNB`6UUX>j$ z`Rf@dYSz}##qXjI8@DT0eF&U0(gmIU=PJ-t8JlMA4_<()=P40gA#{>RnRw4ijRj8c 
zCMAtFoR!bjaAaZ&MDsh&{SnOIriEX_{MvyTPv-cpM(>^OAvO4iQch%A>Ly-q(y}|- z@6BeQQ46vJDtb}HxSO>+se`FoNgetXW5UBf;IKKir>qXVZNxDbb22XvLYc2<&}@Nu zKI*oE@`5-fQ*S*0YEsu@3(&*vr{!~bdT1)(fW8LpkCMOi1*0j&^)x=jk$xYTlz)n= zG`&|FakjCtlst+kz2M3$>m9&dC85ril18W;G=v*z)X zd6J^Zq$zuE%yM!nD}UOO&8LSfBwU4|vRK$I|1>_Nk_vVM*7#C=@AbTf<#Z)BqaG*Y z@Os`q-w%`d(w-oh4)bJj6y`~y|7{j$}$^S{h@|xJzqI06bWW*eO4%Bxc za|1sN^kf$_`A=xqApa?Vflu*ytb!>tc~F?hE`k1Rcff7!U;`F8cr1M)Oa7wqr!`_ z%i)_}r(33_2f-3`KB|s&6S^{!Z&RtUe()RYxvDwT%Q)%CQyE7K(+93U5?m3Anv{BM zx}b)cQ*E{EY%2l)>@rq5&aDW6v>-i zXwDBTYU13mlr2fGehuc9xV-K?Hx(M!(%t(&Wp?vhfCloKJ{TuJ@fkNjTZgbURPy?` zL5<=$(g@VrB}eLLti1*3vhLWGJ`aja?*9Ae^okF*p|K>(&S3~gfo{2Q^j#5;m?@Fp zj&$FRcH^c2s*wZ^FQ^UZd{khsT`zv7-=GA!5-ms*jq#tMPvTSk8Zb2%tLdi;+dxdU z97WPz5Jx7G%K7;F&Bae->ovS&T6gr|n{lv5Oc7@=k=P582y1hJF{5M+CJ|+zX7?7O zwhWD3(Jh(AFfAxTYyv+}(v(Y_JJ?fx~MP+GJIBI_ zzI^1VgFKoNG>T|4VsU7ueAdjM?$!X4*>YWUcpAr-VfcOZC?%0WNt~ZCBq1`0W!=px zy+d8u6_B4_`{|u?2dasjkmThBbaZSVirzP0xTGKqia*wYKH};%HNm8Q9TM*jebyJ9 zkup{g55gTwHswgaf%Nlh`o+%|-S*U4()GO`Uu{2jR_fR1@#!_F*B5{P{WLe{PO7N3 zq}#Bim(L3KRJ^mj;nU#}nJ+6i5(*$SgMt;r&7%X5ei8BSXO=b>n>~O^XwNk9 zC>f&)pk?Wnh8!Az4&I1~x@ZoU%1-j@gdhr0eTbf(fl9yA ztdK&@4Yx9;dr zeouctl5n1w7OX+7(H#MG>Xx+HF9TiU-txOQD14-je3bAE#dXqfW`GDa{ygnd>e+C6 zP`_7DX_nb#VVZ7?VVu2_n#yvSZX{Ghnatgx_@9(L;v*_Fm}?|Um4!0-7}7XK)u^Ah zy*;`dt6}h)@VsoA6znBP-Q&rmwgu5YJDkN(7~tMKN>r%D%d zL!lTGCdzc#0yg#MGG5*Iy@`L3pr%EC?)$nwx3$`AG2JF;yiHpkl7zxFMYmpVFln&s z{OOBtQ0E$)UI)$1B>z+*m4XE7`4+JJm9h(O_BqfQ=@Y0nUdJgKb-V-N5h_TUal-g@ zj7x2?{6)$P{EE>}spCwF;%?7#&cCz|_|TaTNYN!l{QewinSqM`kpl-{4QS;~?I|eT zPn6f*IyZ8qW^Gcj#QF9Gv%HQdHhsq&n@LQM|^_8PK#_0*sMH8*=zPZ%jr!Dkn0xRy1KJY_3(I&(kQz%Yz~hD^|Ss! z6VM*JGtJ&M=?~OJGRWam6q9%Y8xDY~96+@IwL_RxlBZ8{s@(z5=6BSfF6vRt@v3#IMtKVk~cq4jxQd*T6R1u~mr(mG5`4(XwIKDT9^2e>q8*g@KOnF+AMEK47e%G)X? 
z1vNw!=;zJJesEN~?Z;QQ_$GZc=)K=T+E5i2{~ebdaY-PVB*jC$5~{$6cq`PyE6M>i zX-_?pGmDXlA`&=>*$lUbHCJ9Z+S@iD_R?ps3opcAcSpRdd{EHZYwSVPrk}nrNJaJQ^9UK2joeM}S(BtE81!sh|gztro0@mpuKzkxYS@06<0gwQf6nU=4Iwl*zYR_01tW=CksYWa*h5+hFMkbY zeej{*zWlBI5$H0Cr(dD<#PH>34H914TqdnUB?--2f5)J30{3zP{ILiX1W{lBH$CHr z65vT$c$KNhBQWh+MSNi0#TVG@*gD@D^+$oC8vhi>D%{L86-Z2xIqQpG*{nNAn+m4HZGi@v_@);6xNE3Bk#PJpK)r8} z6{+bhoBFyPb}geaM}CO+W26%8+wo%+&)7`=?3KxRsS3<|102wU9T>GeFO5-%PV|O9 z{aNtAE@!FpEjSBx#2M{WPqFFmbzkK96zM$uBp^{>&}b@d!Yn4W#9|hqKHo@nE`j_k zK@vPvcLDPAX3Xnr{O>ZhQ^x$4$)7jloN@o<@cs0jA{$GuR9V+G<&ai4CC~q#FBOd^ zlK6+vU9Y7mwrWu`CoKhiyJXka4bVubj+qEhF(9&1+^vd7EZF%~1;vFjw=K+{0%opD zjBZopDSmPJP^DtDBIjR0`$ZLQl`QWnKND*o+VgW}U|K#+6uXV=HlhccokAZ%vHsD8 zv@?4tP)eB^;?IEssrSPek0F9694ISo7=j+3bSGY|l~3})j7nVi=u#Qm+cNws2;k`f z=-Qxi&vcbo8A32yUsbpBgcuLp6(Wp_IRj;0^dMY)S&lS`Z;y>t>;8xY^ z1VD!|$QshKtZ$m&*?1|Z4uD*!_s|{fY59w!ieyg!<^Cv-Ud8z}X^&?C-#G{zPHuqH zy{f19&k_kdDicN(u1W;P+I!%}c+~>6!p$&jxf%9>ssvOmQxTd{RTwQbLbFR%^1c~s za%mDcKhI1Wk;0C@E^JhF#$LS0xm&c(xcFKj^{dOB%D7JQPL;RPrr*0*KlBIw{_r0k zA$&Z2_cjRi%u$J_oieIWyX=1ihrf>jd3{{pEd9Vt3BDOo-r(cKi zv(pwB!t3jbg-x8&>(BZh7UwGZD$0Q#R`DIc^VB*_9$+2nKqPT}d6XVLRD#8Th6rK< z(tyQVrMUdK$m$AJph*?%AZ8720*$u1=%cJz?oL%pYp1_w{m%dET^=ifRp* zuXldzCxbFlp_g#g(9EgqNQ0U~1y{@L3J^0qy#zJu?m}Je+#NL#xtk5hx0bNX7f?Ns zF=B|*`!(3bZ|MvL1TA29vQ?Q)%oCXy8nsHWyU*4H)^kQ{NEs+l+y?$|DV`3pJ_)-w zY}rH>yw5fu_fJ?84X>oxGo|np^v>|-AU><8tfn{CWICya!}XPFEUeXUkI3OdnoSgx zEt2wqMwPb#k6hpirsv?ni=g~mZ~RNN`vDboQUhW(W%O*&2Basp!K9z@I$lmv+cfv7 zH^}GC!Q5cuU1g9FDmJTICeP6sLgRAa;QstfEZj#M;QcvXgumW?578@7C0Ak_U13HZt(|-agklFI;j+42_r2^{)L?KBkhrh|B%{64aI!!t$2I`w zkLSSEiuA%(LB{oWjD>Z+1cp4&nSQ&1*+kd_avAIUfLpcGENY2BUAM%7n(ovVpta*P zYzh~x>hCrNEn0nX57n5zN&K@5E2pQ=B<@(}z)oOLQlKM&=^X(}yWtiUh zZW#OS^`CM0C%)bPgz+DT^46Lo;ndd_(|7`!z(~3So2~%awcyNBA(j4Ko3xf71Aki^X;UW2==4s^}a=@T}3s?uAvjaKTbPYv}p zrGo?wsqm9uy!|rCz_uXekJ9$j99MCqDH&|28WbS7W$X&+V$%*K_U;pZ4!yPjp zv{QGVr%3_2%&)VCT7*&pRINprwli0eW@a={VEV2jbzQzw`G^9P&L(EjbD*a)srLt`XVWkZQdBWeL>7pRD6Nr_ 
z!i8)?%5Kl~s!vhN<-Q8KQ5^=7J>05Lo_^l80-buQuGMfJ$0>oKzwG-{D-enn9p!#Z zSqL<|r(R-0O+kFChV&7>sk%@UxqUWq4r$3FKid@+O%s1rkae1jM^1jqr=p({;c5J0 z-62vf(j;9g(lU|?B&HrxaX1XG58P3n^p_&s4(gDW-TRPUbj52LpOUYn3weU``({Sj z{}&3X%Fny9$)x#-QU{e4;!Dt&Pzq(IDh9sI)LS8Sd`yM8)h)W3QfPJMt4Xm79MFRLbm(_dolSAjdtnQ|K?yyNWq=CD-{(Fo(#$Hmzc5)D--8}0 zG`4&B$YKzA+@fLQUe;`=tD8<^z)<1sob8P+S_2BGL^++^!5UPHj^N}@nUCJ?_*!!R zc%2&6(eXkp5PB_wH!)>~WW{bT8U#NFO2TYz+%P^IUIL92gdQvv0WG1{6!v8Fvq6MJ zja}cUU$;@+*EQgC_J=zCY-+U>a9YvB+!gLs1ZsVG2dLX7>e~VOhR+jyeR{qKM zvP7}dXh{hRM@=?F=~=w8oOv|9vMISYWHS{~l^?IXiLW*8XOax&h!0+PB$KEvI ze4-~*v>*FMa_o3NfxtI6;(n;XJol_EEV|)-sD~?4?>Y#S;uD~~;KwOJ z%@qJLsT3_xzlc<$k1t>^W@`Mxz{g|Yjnh`tZWGPxp&DHj0t>=aDqCoIYk+*~(QIS- z51wCL{qW#yeSD=9*m46}ZcK5w^yYAxCa{M);V8S_-W&|M%fwQ|HZ5;sM&z2h?d#j1 z&PTYJU^7jlwuacWqWc=LYFqSMxiWQO$fylq53w@V$4B^T3{`<38xOCQcZbLK&MloQ znTz&cSTzIfx69weoT?$-qKW<~>s&54nMFaLi0A6xul|;=rbYBJpn=V~p6I`Hk+mw= zJ*CZZ@zuMkA#Jf445~Aa=aO~2-%0pV`tBGih-!m+7%JPLejLPAH#Of=1`X^Y1WK~< z?d4j-gk0wh@-1022m+MU`h^^)n;^*G6AHqyX>r%9R z*_ocGzaVxB`k%mEm56j6uu5)fB6{2+{oWwm`q59tPy24_L6LgtSCG96@y^Gby)JEP zmpV1LN`h&!sc?wx?x`>p6s9eye?d_9rE@O6br&zFxg9)74280?60PfOi4-qX4d{VQ zi*5SI(3*#qpd?;zZqDMlCthb1RBIT|uczE=^T^Nw27%(U6kTeNQ5W4`p1t_s)g;lT zI@0f4Qk^pu!)_zsBO|s72`kL2YvRo9wqVnN>aq}41K1mROAEZd%KFz3p(d!VSLpr& zTT$`kN2qL{8lQWW>QGBme!`po^joOygj$xm6hhfa<`_RR|2Kpy{ z%rw0c@C)ON?JmcC)Cq1V;bSls4o(fW{Ol&epDbD2%4;HfI8CCJ`Be% z>Ld3)F{=ou-V$84_<{4PNY~M1bpz6`Tgc65xt(GX69@0#)VRDFu5I9_Oo0hFV3fv7 zrFmL{(-Q`n)i3KvX`}ug{nHrMtsjRcs>PL}_!-iIy6b&81Xd6!N|$;R8j)^3=Z|TV ziZnf@AXdO6nBZa#Ypm82w|fg#y!rFHU-~D5Pu!K4tU7DX#2YGDa8_{tg!4Uy#H-ZZ z>L~H(`@SD1vY*{j+STI8%%H2ls}!GoBUVU@3_4mAT96K7H}p><(Uh$Gz5Q!SHVuHL zxk`s-&t9)ilv|P~`9zUM#yQZq_HX{3KyH+8kJ$9u^alb>e<3j;RMDCm4LTS6g6@)b z*9zz77Uw4kb|za2{P`4cw~83w6DzSh44;o%k}>`1(Mn+)G#z;HL~uH_yh%xB@YI-a z_~T=V_j2~0HkbdUsRXs;PSBbFin8ePvjQK0x@m+eD3#`tDAY?Si9?|G@#!Az_mZgH zGdS^*uK;U7?!OMTT9c76(kwO0nW`>s(dhSiTiA4+kHY*^9cZmLX(zdRAt*&hBDhIqS9DJlU<25* 
zahd*n{#3yZ_rJ`lWdamx1A7DZ&6>LsM2Xfe45-sRG*Ih{<WJ>bLoKJG^hFSk01=lypgX zgUKPm*WcXyk~DXM@+WnmxTrMQZqRt8^w>Ygw<(@__vS+A<0}=KM!oq(mFv8Y^u(bv zTyUv~#QT=Jd=zGqb=Ly*2krDN`?c@;uxd!DD>qLH3b+P3%qSN=fZgm_eQQ8`1Dom| zqt0WA6w4?zG_I-+1K)*hi!_=~J--c>J?l=MetX%gqLvh!ay}iZ{-JmS9QRjbV*YJ` zYKjh@Vc_ZC>rmNfiX~xrZ6V()4xuU%AM;I6PjV9vEI*r}GF0O;P@&?rGZAsI*#x3g zL*rESU>&?x#hm)8)90+h@5{E&=wA`}kCOaMF+psQ7`~$CyX6YNkWF-2`y~XJw!(?M^8Xv)35K{K z$Kh=zQlRGP(O)OkM+-%^Dg4(?6{nwPR8=QPf3)TPuUx0w_o`8gkf?PHIRTLCWfy9Z z`wgbhrg|Ld&PHv~^$d;@0T};@-*FkGh%~-XdOlKqsr~@x*M#$Dg%ds?>ZaNZF`j;U z8;$j9@ES>$;Z8|vzs{_QRm;EHw*0541(lTEBgbaS%fIOK3LPcXt0t{rp10pNBZ(`u{y@ueU0_)`zMb_z`<#rIW0$ zjLx5hJef5|Sd~lp)v8T@Ao6*S7@uBLh*&3>)J8RQf`lshRzsh6Xw8cq|h_h5?A zwB8ISFl|%Dk}AKrQw``pH4m#~kp-NHYO6?bq^8((;N-ePz7Gbu^H4RZ6~v7y!Tg9( z53s552Y}P628mOpd<$A4eS15i&e=0;16lbr_78P34ErcP`&U9$K9wq4paM%cpQE_+ z{^t%Ak>uW@$HKKLgya>~M5?N9_0^9S?S{2)n!8kvU^W%0sf7|J-+PQw6nT{bs}_06 zs{yO`%AucPPpKk*E*+~(_vJvLL8^qs^pudijY?B##S>5I8A#;R2K7Yczdnf-`JG2~ zzsvhXP$iKQV8p{f|Ai~)l}H5!)u=tGVoo64=Os9C{Of7Tnfnvn{b*1U#guZ!I*m|b zz=ry~t5`8?zvBRG!m0qGtyMFvt7<5lkP?AN0-~9|cn0{pYGj&JZUB7ipEgj}`L>59 zXaaR8f=UBT)am8`_L#4vf;w9_`YrK20MmZ4 zbqiiq;)kH#?<(Gt4!~hlV?Tt!UvRLwZUwUJ(HS(h$XIZo*=q@nK$jxW`%Xjc1G^y2 zrm7ULdQZ)YM^IJBx;0Hu`$bxG~qYstOOw<=3^oS#KXti3%F? 
zXv4>c9alqY+u^jT@-NC3Xb9+uRJY?Oxet(tU(v2qzpcV_=+UJ}G1jK_`=L=vL!LG1 zD2JjLtkf!3iPGgm_v(C{@GtO+*!t?qC+43d1LMfMW@Dd16{B7uje6Cus7mYquw&E4 znY6g{G5Q0Ws^$^^Lb>8?)rnH2igRnyVf(`)Z3{%EOr3uir?%x~>NxQy8PzpmJ)zP> zHQ3wWA2O=SDWK@E@MuL9{)!YtP*sO{viM8faw!2kIK#(PHMc2Z7}aQqRuzE?XQmA< z>y*}U#>Iahhr1cQF+=K7%mCRdx;jrSIW=b6^$^q21O$D?{ z?sq&=eWp`fovW)vY7=t?m_A~lqJ`wUl=sq8EiF$cRe|PD3RAQ9zXSpCmRfZzv@}z1 zCA60ZO7mwF;|V08ond|-z@*kH=f>g1j4zxztK_NMH|Ys|LZYMqeL`ZWLehjSe}UDR z)UEhcF)6U#6&ea{8U4Rzz~dkMaE?_ThE?%}^6`=?B5JKR+cexDr*sbU^Va*6J9BK+ zed$f@Kl;bf-=y>oS^`wmg+Bu7b1kKAPh`_aZ4P+pPl48xY0-^~PAORQZL#)6*(apP zzXfVfgA~*C8S13;4NxiXZ%OViUYPL;;MHlZ*AX?bC)I#{?X{_Jm|=^wal9YXa}@vU zBAKFjG9~f+qC8o{`Ewh3WGVNZ{PbCS(2YPr_Z7{lZ1niJBc%%(f{?{@8jzkq++X~* zVNYsMS6#d&KucPjC~i~hX71!VQc!%&H~qw<)*mBnNT(#E2Ceie1}Cb>P!~weq)^&| z6j`*%Gg-zaPYzU5Dov3dg^Q0A=2G}J<^5?ckIyDhXE;A^zs#!~o@{ZDPKRh#{j}kj zhycMTC}~9zLg}L7hZ#_ZHszW{3*U62`yUptJK4#$v3dA>UP44G{;SDjo2pcvXhHf* zYhS8;KCNGYt1J|gc#ej58a*mLmsXloiKOsx(WE<{ozCxiWjYPfpph2r-W|#OFr-DW z2R>S9Z*`yzg~@)s_;r$BGO13DUh(C(wz#AG`YDzARtNf(n-jSeH|gmW(gvm+F-d)k z9_<`_lLl7ob75+vufdG5o=kJHb9w4sO`J7l_>hSVaP{LGC3!}oU?Rsgi}Gm$(u1{M z+CRe>uHO0+PagVbDa|AOj#0C}uTqD43)F!AbabALAJbPf{P6y?N780Gwa|umOL01< z%=J7Ue3Q7n&SBzSxQ%b65%GC`#*HeGOM7~jPx}Xde~N-R(Wvq1c=g*p6iI##HF%rb zCmOctZe!Q!d8*%0nKK)jq|xc~UGYG1BJ#LTH6>5#K&u?T!{&}^=t(D}N!h_@lF>Vc z3NJdo`fX6xK?x7~z@^|XCr*V#o!3$0oPxSjk>ll1Asv&#y#}b7$IPz{G`}t75^A16 z_YA71ohxjrS*4$HXYiydC~3(1XxY=()SrC$Ii*8|@HxIuUCHb^L1nM02rW>nWy#R5 zW0OWYPfI-X4(*;r6va8ygC|$YC326NO@EIvdQX4G)d20=(dth#{<}H^ydgn@+b2F$ zGVn}K2~^}NRLkP`(c6dk2#V)p?)Es_rVUMg2`EBn-z$xIzhX8iVx4K7;^`MujA+3+ zZAofFNcB7#QKoq19bJBpr1395nJ0yTQ9 zO}e{PTdHPR=G8}!1}d+`HB~r%5T8m3>2sJBlZb))H={k6{ zr4(f|VD-y8l~fa2unvC)t=>#GDB^U}#OW4Z-P`aS|Frhmt!VQr6^XeaDY{Y$TvJx7RN-`x1#_sbx{{N-@%+jwX+9jLfdC75$K_izDKbe+aNHBDHbwG|GQ z6vN=c@a~>oG_hs5ne^i_R#Nsrb5D#>!I#EG9ntc|p~_t1t0Kkd-1~HO4!e_Gk2?My zBc8#<4QqKPA}2>r5x>qf52rYPZuh&!Jx@2iM@w>AeR&Af>auHIAEqTOViZBWbK1%y zu)(CTzi!l@n|~YUlq7KvJ;c*psoK2Am5@V{hn@t 
zH;(v>IYsWc{SzmFMz?ezO2;CRRxCeV|57#?6{3^bN$ES*?R~ubb0!AnlPL zil^W68PxdiidpnJ&@_pXTz-a8dZKge(|Fg4=cRi2Pl+_Bc?a1`sJQZ-S~Q}dsWyaj zsQ*b&=Lya^Z#E^-+{f_O=(frJvdDYZFBQN&szI8@p}NAM(l&MY*KA6Mio2-u;U7rf zl@!y%BMFME=Irc>FAcLEEI|RKG!phXM;ZSv5>$Q$y3pt0tG#rkW~iwm>2$nW%YLa9 zR5NKgG(lZX5RoIl@e6CO+S8OtwZ!g)4iBTan-Y=EQ>dZ?&kw4n4Nz}HND|-vV2gu) z+%>bGMVCc~M*#D%FvOeGn{v9(sp3VQ29X<}{z7Sge)M+-kxviR&O?>xM-zFHHKA9| z9bru|YP`w52-rDPF%Y?v9ju*P>UnI!@39fM@=e_&sB6YLz}GY&L2e6A9f^GTu@aGL*+i!JQQ&s{a9`F!5uYDbe-o;tSw=9f}Nv3pYK_ zp;B#1k)Zn-O6rq$sT*Z~i|%byv3vM!rXaAhN8OPCCC1PpmQq+XEOK5Hqo#@JI@X#| zH!4};%&KC(sT`V69Z8Ne0$SWkZi%I# zaew}_JpUP~Hj`8R|9q)V7?H?x1PPJyWR)+{D#9k#U!x{q75ouYJ z(|nQoC!GWoDQYl}?o~p5k0D`vbPghbTKJE#9z|`5I0 zDIPe909(ZpG>AT37x#c!Azj?+j+m3snN&ymee3!=GX7ykm-zIC-0ZC`FaPqgTIxzg zrrMis1$SY1A*D=Zo9r6yW*iV+%NST=%=5YH2W5+Z6Hlnr0LNcY}c|#naT1k zyIA9yYq09)Ep*Fr-;+JrmaycrJ~GtjsdHXqSsdxiyw-sUrx%tpVeLZG~SU&K~Hjw|HIT-b3BW#-FZ;86sf_Q z5&^po{K>`rA2u#Pq9v_?-CX^w*)#v*f!V%kU`{~Wr(n$pVry1?tKy7E$$px{{GY1- z*tqbH=b-SgRe-3f18Pa3J~91#u?F8j=!) 
zY=Y|IDGU?qRHjeu(L*)%KHGS|s#0&FHSzj~>a6D#6)r;&-2{}jH0p?c6`T*Ont{19_mReK35<01>D653BRw<`x=^E0X z+jNYkA+%xJ3V8WurD}zu0+2xyP%$=HK!r+uJJR({HOvNzZGiFbH3+BZDV*z0;zHF4W#n)3afPsRyXnI6V~rjtVX-%@aO5t-68GfrhTA} zu&xSgJAwqDc=4n~;UZAq72LU@$gDxlk%lO0HvL)G)s|QQeKynwa;*fw`G}zJB2la^ zp8#9t8=q6U_@mJ_2?D5C0R= zO!CrA{P_(~@yj0?N5?B_h%#YA@L4%s^|~cmL6q}7_9O%Ci%UypErmK}dI|i(NLq)g z8whDowv3SgL=%0{RxPtKiDmDAytP=i-U7Zs+Hkfg8o&%)^vC0_EtK75SClPn8Ubj) zdYj`Y%K!DFOmWumfp(w|64HBcQ~^FKn6z|h)S_42BM=a`t2VWzDbsn|R013OKroXI zDw5vt7GOM1!7PrTOmAz;v69eB1E`nAvW09)S^23Jj2=*-R#2JGnw4E!pt?SFR_Qfu z;^T)u`K-FPn50G8QgEb&!%75dT|QH%K!0xYp!aSE*keegAo|Q0t)hUA{%U`SRjbrp z*M!wkgN4(in>0ExroKAX4F1@aYA6>FAf#gW+$WO0VR5cB?SM1_bPn}9I#;+;&HG8M zWyQ1B8VVH(xquy@-yG=56CC=?pO8N>61cl{jnxz`tK6d{McM*$v>5)=Wj#<9 zdJih7bgkAb`g6dZzI>c>>lu(%{$yl>1hoD#x6g#M$OL(vyuf{&pEJ&5Qkvh{sL{b% zm#s>h?wB-rfv#7IOzOBk4Pb4~RBo<3G3XbytkTAOb!!^BWlKDArjjD1i+eT30=m%q zAh3(C;u48U1J?7ge+s#`!xc?8E80Oo`r;u>R~{=-IxCinK`n+o!@AG))Wl1Sf-smg z6>1jzxr%C3rKHv#fkKsD~KZ12**7_(&YF$Jcu-6v8V9~l%LhTZV{w2Hi zYk>4|doe&0i59FG!1UWoYYL;P;t4P`SY5~2Lj17`TQ4N8HL3yV3D(8cs%}m%SF4x? 
z0qa;p^wC_PWLi%iK%a5|NmGlyyKt^7=c$N6V1T#M>0uHLohp!fLzB z0>r@Dft9#-g+w9xf3#YTzoKCE80yyybxK#EBV4E_Vts~v|GYoF|M@fiZC}@2HtIAJ zI7e3TR+S<`MR?6Os_bsYFQ;E*RqC2eLHUcSrK2yiGc|lHwakyTw<6C9?1Cj&(PGoZ z#|lgUg;@dCF5ar5Myn>Qthzb@G}HI*^v9LnH72F%67>s6ii$ovWKx)C9WPlFgsVvN zbajOa;KBtk3F=uA{fcgt7nohCa)-9{|0nTeM_?(FhptwCJ+Hrh&<40V$8G&=P1aAkDGfFo%8AxHyht zaPMns@>X93dQ`|;3(O>QqZXqJK@fpz6=+`k6UsNeZL&WPR&&(?+=TJpO7}s_VwZ+R zl!#_p65V?h=g-ZXU-yBE6yfM-T}@Y|%!GuZdn&e-P zul;zp$4C#T35!ZlNMt1zdSt!lyHqg&jfm5eB3ad*oA7`B%)@qZ>Fp!kAjJ76Y@oRn zTjo^@f^XE`r4}t<>$&QShks^yk7zYI!8q9`s{WSUQJUP8W|Pte=zBp0pkSf!6Mxnt zGI-*0b6q<4;8p0+_}#0ZN~Eh7N|<(}N1ufEGV@a^R|?;I68z;bRRPo8(y_0hWH9?_ zM70c1#CqLL)4URHP{gJNbv1rLDLw?$Ig_+g8<2k9Ua5RO&U!SKk?HLpd~^1YmLfl3 zfB|W~E{(cerYg8;UaiHb2=OT-NwGe@4KFAF4o}a6dU;#jQ1uMuG*{iZ7)8Z@eDUb7 z)*sun$*SdO)&A##O<9uKfj`y9YZWpBejjK@%>F_&A&pCI=%GJuv40?!V$fCzaX(F+ z^=$DCbE8y{lt`)X(jq%0hq>}gO?Gk|q#XUz?iF<+N0O?)=&1qYBWA%w5uGY7q1;(D zO)J;HBB+Xp`<^HZ%6(8D2WuDy_5Y2N<*3mfEPz6+U@z*lUDEpSEwFFnA3FMHY#>)l zD^vV05mlkK<~zP^i@|m0vxZ{bz&DT;dCo^&gW4eXV5D}FGk8X4V3sKS)QkN-fT96Vy1)u-i+V*;}f(k&M+Ch=4^u?H630BKg zSZT8=`%IXV^cM!~CDi$@-dfaQr;=ICN|ZttZ2!8`LhI(DJUnC5 zicvhjJd;_`ZvVu00OVu2Su-bqDsw<(lTC^$QdC&Gk z->;eK>(drgw?H%qsj~P_FolH!)BWfsNreW`x4-|kzbtiF%+Kwg)7zkRiUk}th_wPI zhLJj6wsYRHvm{eSRracJZ^7K@#xm3J`tbK`SeLJ|t4C}mTyd8CLSZdy4F%dVbVW(9 z<)z(S{&_~6W|}YO*i-ysK)52HSwSqy%NB@O-BQ1FxqkT=*v;obVmKZ8fcmvBnwNAL zyXrI_*sue(hsnn+-WM)_S*n^U!ZAK{4r=A@5NgtZ zF-=hG=l|NnI+N`}`U`~%&S5CErE*}-v&Lr&{layWb4w790;b^*5?Z48`{MCW?EsJ_ z1dF9Gy6~~%m9@O7S|!S@cBK2C)C=G)KdICVpah0(yXdd*#+8C(ow^o~b+5t%<4U5c zU-ej@3^8Beg@iPr!>!5{)d}ky32fmGAXM#_t#%8j{>wGC%K2y7dC}dk&g})@31sQs?AT- zKf~8 zf4)ep?&pV0d-L)ahU{Nas(5~;-c>*Td6(~F|Gv9h%K%%;od6r88xdvz>7MAzZa`CJ zSh`eUJ+~f`PJ-Gjyn zKRi|qBa9iUiZ_6jkOknY&ykA6x7=n_I!gVTwftXR0F+}#FCfN(@aFR)Rd@i9uK?%L z@ngCm&F6jkGIdu!kuF5M82VqQ?I3&Ifa&;)L=F+)=#M_ ze2{4k*J+GXP%Xe=98hV~MLS?YxlJ7*UEBfJu+qn1OOy(Rmm3%$t4FblKw(G*zEwGm zks!XEG`n~X6OD^u45u4)2kLq0Vxb2`w!+0bH9~O)B?c_H69qKeKl{`yH+@pC*%y|_OOgN2tb 
z{P1TO{`4;*yoLVH+n@NvzCEem*^WULTu-3UjPK4$lSCGTzgmVgLfv31*vZdf{6zOK z=-i_mJ=4p>D6{(^L7+jB-i2sl3AQ70k}fo%HP3JTo8d$sJv%48+-|>?us%=#5*=v} zRPcthf}2=J*l4P#qMX_$=V}9eUC0UUz1a-T)n(@wM+q$YxPgk|gX4Zu7cz}b#`<-F zH9B>k+Pv<=fCy<}QTI!jcd!Re_=z}|9yGLikh}IFO(N}@SEqizX7e5mrV5%^zk0W) z)IM6LUGJg#{2h^`Ks9(2tDwO__gSEpuqU0Ft$IBzWoeJ5L5Z3R| zKAFZgWA1xTGYS1FW#)?Fj#l&x)s4D1%uAgVUtTF4;YMhITxWhtSrfP-LTPXnnPaH* zymw7dhn)|6Q=h(b@v|S^x5#P44U2A%qB+r(RR~Gk?^T?YJY+Wa)k<^{cSm%md+ku- zDYq);DUEwIOa2km&l9LPR&RgIb9|T3hA_UEQK|kl5C7*2_v%)is(a9U(-3<0hL_9= zPGvM`P_s|fsuVU`cRHMYeKIlm_lB}25`wQ~7qu&LQKloexsG&`i^^60Ak#}5>>w_) z)TA3AhD#lhs#>o5v<8}+{jUREu7SgDpL%td8q*Z`)P!~Dnvnj)sESDSFJNV?Z?G#- z6di=3jYbtjRckzP!1Jx@iU3cfcFHDBsrxBG@!sM%8gq1 zf-3ofb)2` zxhXCFm|v$Szps$2AT8}W(CAy=V?Y4`sDmxK zLD~?zBE|+!VNd1W@?Zt-t`e7$H{=4=fd2Vo%%4AZ`4oNC7*fA<*0|{k*mqY|QIHh& zsLRtLb%8qo&NOqaOA270l{E-x1*^P2OZP!}{^(`LwVw#Uj#B(nEh4+tVZU?+*IHBQ z#0!16`huJl)wKHcA(iVVDi6}3*B=evp=_JP2GGmt_gSo_0jqe`q929H?Q%rWBlz3!Hmp=Y8ha6u28ss? zjJ16ILszW7O&PWppg!IvgPTS^cE=O0ws_#`2T7%&QMw`%STHK5R5a>4RhU1w5M%vi z*rTDvqt#bUqFM?Q^rD!jE4Y7!&QeuXq%l?zgZ_O-MqiKXiP{Hjs-}rg4ACJ3Fh!hF zP}GXVr;O&_8p`D~yf-`Q^bC7<^`r6PlpIp^=`A4TugRH!mpeaa7k;cMI zRRh$o&r{#er+?tVYRp9Lj;Ku;;hpztPGBqWf56DPcn1QPfe^&~lxa#Uut8IDkGtV* zsbdtNlld3D^8%oS7xRcX2oWNP?3h54C!(>8OJ zXEtpk)zf$mB2=%Dxq`@=6b$TQRs7U7A@wf)MU?tOZu#t!2tX!PoUenn+TTa*Az)WK^ABQIRTs z6{BKyp6B`foJ02n-i%Mb@2im16_K7)hxr~<`=VWC+%JM(MSRXX)%$dJYuT=p^pkQ3 zVOUC(3vX(1N3Bq=WMB3;NmK2|l01>OXzyi@k}Bv^DV8Qy%BC}>d$2q_fh}VOeRhF1 za8jny4`#YdeE3}mYpiy0{XFdIzayw1 zCKsl~tmoNDa;?@EY`wE9k&}R$CVHqyR&y;C75&gfqD~}EN^z^+1D}&f^;7bdTuM=z zA=iTT%-Rz7BSRfc8sZ-4dRH_msDeII7&j49S8gnj=uQgd{pj^Lu02$<-2oK}<$SJ? 
zlqQKf!{R9=I6I9dO2VX2d}`0*0EJ~L`pu!LbCPC-B2;}NWLxysD52I_-*m0kO!|3C zbCTnnklaxzyJk~Hper!j1$x3lqDYA{{&$(7q{*x2H$#}9#XHf9F%78zQg#XSB`t!(JrJBgT zJh_32+(E%lRrFchf>ThHvSQEGu=Ejyp1XYgXa8Cl8kT_W6JP!!LoIz4{%e4WRsUyl zt^Q$O50}5et;%IIq$Vt;=ETaX%fK8ZR8lhaf>k%4^dk6k{=5SZ&?43+vuYAnJ~V6M zPgTcQKNqYqDqXMru=@6qh6>84IYVW2$h-5Ks=%r#``0yLg@)dJ!nL1Bx#R)NV_lJgP@Vo!-itu8+T7av1Y3`V)>VC@{Q2Va1cNFzbD;Ox{Vq{9m@&yQ!jMDMqPtK|yhUepf=?9Ph#6u!U`djxd@O)6oOpCfO7*i@6 z;FA9{ibFyUf`tQvmr)eONQ3SlKl5@K# zFz7AWio`>uZs!)ok0}LkX#Zb|+4ZpA~O3RXffQT=^Z z7aua^)z5y5+WDl4>JzV%jyo2Me~49Q_4oQl{k(OH-`_79Km7^Th6n(SUm;MBl9oIG z(8WdNEzkcQV$40Yq(l}Lc$vx1e2QY3QX&YT7lh4J413S;(exe|)CE_OAGB zztV-X>N+k-acZe)!%)wE;*}t{j)Nqvt>)Y$~;7T{c)dGP@>`LHaXM-hk0HD zYRaoin?>qF8mhA#>_YHiluA_U3)asAthezWJ^m9v!$1E#{qy=y_$LfH{uw_nI{x%B zihovKwcER?iYGOd10{ND1dS^e73>Jrxv3s#gdn-MbF+SZX103gq?aZ1; z6H7>2*fr+T4~zj;!&cb9UB_4cx*wSJDO6(l)A9Cn>peTatB*ijed3ezC~Ha*rXdfY zV=U#$bVUj{{>p=itbVQzmi**cHKc$|9tHQ{Bj8Lls$k6$L_(LYstQg9^td%?-&3vA zkRn-$9DvdMvNC0@3Uw8gZ`D(z1oaNqLHKO~U;HwpQnNyN1JKj8 z-*C`0GP}m3B#49r4U@}JTFAY!i&_I%xt!uj4|enHH+6iY^p8A+2d`5x_6wA5R(Uj_ z_$FSre+AX`Zig@Z4D|EX{|4#9J2F(%=l`0}afyo4;pDyNla7)^2axeykA9>QDN9aJY~q@b ze*8&7zEk&?|4{ft9n(3Lag^V=@RJ@5ST$t4^gA{^r3&6D@Z(W+q5?AU&K4eiOAmd$ zu=Q1y$iL)@yI&`lKtIjHth$+wCVCIw|%ms;tZ8>3B!HMiU4(1f(;(e@|fY4W`* zLsRsBg!1r<97vr)O=Ql$(&;CY+7-3u#;US`O=Cyjm)Vs^Co)a^lhR8SE5o!!xw``z zl?dWXDx*&t|2B;ALs?Geih^*e`PXpnyrXRA-f5C${)yy(=(ho4(@)eR8Eq@ey034LS5Z z)og~ZjC4wcP}Jc3mASeOONe)k9{^6R0XhB3-Rd|$Z!X%y|q?Z52wr)D~|)k2Io-2q2f zeWvwn!HOr}yiTfqX$PEdfkyQKun&{73IziG3aqfwM1pU^dQXQ7pY`$I$J)&4UW$A# zb}feK>B@q1?sv|m9^m|;U^-=fUnXEXRe=4%-)fL{OP5**5;LfZjZxxV%?xzxvj;TZ zS^tHle-#Pj^Ap^bmt0>_Dt%db*5%BvTsv1w;4i5I7zGtoIM-emUQ3uS*u%w7SoahN zicf!N*YieY2SF;(W&5J_#}}6?c~?@-kLw}I)KNS2_#PAL?B^(^UwjMK_QQjwh26PR2>bHZuHzeoE%&McHW#CnYpE0_FZs~@;l3Wn&D ze^mvVyq4VjvFcBof4n}EMjZ;!$uWAJ1t`XTC6QB*%JXLu&pHJ9NrOPo6r0#e$jt*eTQqQ|}g;Y|T zl50b%S*~KxtFh0Wl$CBRQ)wc1mvFDRV21A0%`BId>~crtjPz$8X~e1%BsHq1_(jC) 
zG;Z37Hd*37df9KNI2o|pY7#P}g|9-rpPjhXO!^J=bOU~Z|B8nR*S>J>$u0b)dhZTs zhy#d)+Cqa!$9upPNXuN1O@#S%pouTr;RX=&g>Hl72dK%im%OTEQ)GZhg~AAn+In%S ziu{VSNir`{)SN;6jCa7FKW_A36s4wAH2%49lBFD<)h5K&)l}eGKb-^RBlK_$UMEJ? zQ?$1e(a`IJmK2zlrz%nc@F+PF%gEv&^Vx|gCbp&f}HIpW1#qXw751)Vr zrPn17J9YrV>C?AmL|9A?wvt}-?$`&EIal$FP;pGGJn zP*<{PY=}xKoVrWdLc|M@xPJn5;oq$NG8G zuh9d5MRmWZQ^QG7(+uln7hq44c!M?f|0Kr>lNEKWKa^pMPfNe%f#2H~m+#j-Spc>j ziYi8Gu(r2J;`t2lICTB5u@7V3!D-HF=n`DS1}pK*RknRE*}g-t%`99rgM)9IW-Mx! zO>OJpkXP$byKd!k7poS%;1H;#q;M4{!hskMpRE2CXWvS|5zi;VOO>8SeKg_%<<58}qDJWcX*B@Cnsl9ehmem+e z8S~J}&>QMmAMy-K`g-Y|-DiR2b}e4oB=#GjHkj2eeamgdp(epMB7+n*P@k>kxz-g_ z3Si>-YT5zIPHlu*ZwTCx`#FMYO-f+2KwZJg8FzC6wc#Du2tF~SzCb$3tyFYiI zCoxy%h_tpL)v{qOXF!YR%K0gKRl}J>mvt;Kmon@l8+AvQwm`bng{9`rK`lt9L)EfR z8x&MmLci?j>y9b?N@A{1sdjY)^M_#Oae(w#y~#Ahu!1cg8wp(rmn=3dgue5Q>TVT# zQjkU6K_6~z2Lj9P+!{3QUH+|aRVhozuXO!99qP}|ExS_({Z@NBpk5h17Te)_ zp|}Rp3d3rke z-nSdN;q}^)2w8ST(IPJ0#JXF@eAD6YgwhX3q3!h@yrte}Pz!I{(_yo>2IJm_uu``A z_g38%31L{(#;hXK@0{-gx@=EF+DAn~3gs2HZH?VRHMi=1#Tn~I*H1J-dqsHnKcZ-{ zSk;#CR>(z-Ey&LqE{Ehu4`~|`!eisF*2z9=2DYj;ejO+ zNYs&{RfT6y*S=*hpR0{t99w_8y2%HPdlT&n+opckru8noJt9uFQ;Tk~bf)5&s{w^K zpgeKc7HxQ|jj&I~dEV8;Y2r9e&J(AotAT7kj?mvR8|)rnFb_Qq^!qq*m~sZ0Rh~{wVum;=9zE8XUlKDj(2nskr#n`LX%zFwxM?%IeZ0Nh@7Mq??tv$^ z2}0Q043mpBb>yzeEp|C>8B#tro^Atp169M~=?YXF=u=uqx0tozJy=s5F54EsQJYc! 
zpZLF}WP3OM*B_d7LQ7QAln!#L+LSr<51bS79M2AO%*C|;vP+sCt{E48AqB9*&@9SP z&ui{+UCvv<6wNM*I^JaFx;lnQn|8dzvQKBFm=>-}xfv$104f zl>mQ;*nYH2p71H|18RNj7d=6-%gHvikC${DLKb~aG5(`g7irZpleO;!a$Rsw zzn=6g$kq27i|*?%%VOoB7Kr(APj{3n)J!tWiQho4k?z_uK2AZt4<>qnf9|9CW8L^3 zMxwSNZ;;gZ63$aG6IsRSOPtRO%5QWCe&zHU0`2dw_?x73@r@1p70=QAx9Xq%m@#z9 zjOtJSIvO?JpPpXte<02NH1{%OF_ZtMRxl~CsBrLi7Sx<9=-0+fiJmZ}c!^@s`N1*4 z%JZvImH9H|I0tMZRztotR1tosh5XzuTzWu?V>@no)J- zHSH+=L5r7zq*q}^bGX#>=Iz|lbSj3?5B~i91K7~ap$jLpfzV?5;3afsEyzx^`m zt^XtLVNfc#LAYQ{=k7|G;qA$YKW6v2UF|rz_FYX}$QbFxE1C#9_zie=ET-k0KlnG8 z9y|KrI{EWDVCLmdh|gW%L$3#M!5-o99*ZFNu=&Z*CiRiFoD4l?9$}5jD@h36={NH% zMAOL-h5xd{$yQ=rn51$HBX$kD2AeMG>73w;dmwU=8`m|id!RAxF7PvKkNno}v_-cO z7InGyDC*dDfmEb)WJSrz`!F@)X%-Z*OD2TMyVh>g5Bt4O7=ReTnm# zq5P)w)c2ALAd4=Z1dR+l)9p9(V>iKy2Pn{37HoR&*Pb;201q%27lP%Bxyh4Mj1>R( zyD_|t=FOxRPLTWR4LPwPnQwP7PlI{#TcfLBd(x^l#mf;EBZZVGS29E0KguOz;{!a?eukOCrh7a`y}EYcdPC6d6Lt2XS*pJ?Iu*0 zX$p~;DE`e#4nuVC;)?x_izY<8-|_}x(k?A4DT~Vu_gXx$MCWa}oP&|;d{+(N?4zva zxuy$l7}S$$K!`WE%*JTsH{(0u(Z=mu;9ltg%-zsqHss+azHon>orv-GJ?Aysc;W1E zI={s)jZP}q{8UbG>65G+`+Vp3j7o=3yKMy5^KiM|C8F;f&_xn`0p`raC{_AZCN`hk zjWH>skcfuw!|*~+#%9)RZ(mAZ21`4s*kn$)_(&B8BE@zbWk(8`9OTY=r%Ut;8Ul%B7*73Gn9RAX=Ha+g`F*>%r%&yG3MATk*JQOWwwOSD1H!N|cD*p`P#r zlUl~^fB*eA<%C^7+@rg&BQn2B-?qvxt)=Bx)Y6MlqnoxcOD8W|gd?O3=;uwnyu9SX eI|}e)>LY$VF2l=xso6QK|NjFqE)HdPaR>l(AaSh# diff --git a/tests/refactoring.txt b/tests/refactoring.txt index 75db3c7..bc7d086 100644 --- a/tests/refactoring.txt +++ b/tests/refactoring.txt @@ -57,7 +57,7 @@ overlap() -> getOverlap() Backwards incompatible changes: ================================ -1. Empty cigarstring now returns None (intstead of '') +1. Empty cigarstring now returns None (instead of '') 2. 
Empty cigar now returns None (instead of []) diff --git a/tests/samtools_test.py b/tests/samtools_test.py index f0d52c9..7c40237 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -18,12 +18,16 @@ import pysam.samtools import pysam.bcftools from TestUtils import checkBinaryEqual, check_lines_equal, \ check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \ - BAM_DATADIR + make_data_files, BAM_DATADIR IS_PYTHON3 = sys.version_info[0] >= 3 +def setUpModule(): + make_data_files(BAM_DATADIR) + + def run_command(cmd): '''run a samtools command''' try: @@ -93,7 +97,7 @@ class SamtoolsTest(unittest.TestCase): # Samtools-htslib-API: bam_get_library() not yet implemented # causes downstream problems # TODO: The following cause subsequent commands to fail - # unknow option + # unknown option # "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam", # "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam", "reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam", @@ -242,9 +246,10 @@ class SamtoolsTest(unittest.TestCase): def testStatements(self): for statement in self.statements: command = self.get_command(statement, map_to_internal=False) - # bam2fq differs between version 1.5 and 1.6 - reenable if + # bam2fq differs between version 1.5 and 1.6 - re-enable if # bioconda samtools will be available. 
- if command in ("bedcov", "stats", "dict", "bam2fq"): + # flagstat differs between version <=1.12 and >=1.13 + if command in ("bedcov", "stats", "dict", "bam2fq", "flagstat"): continue if (command == "calmd" and @@ -401,7 +406,7 @@ if sys.platform != "darwin": # # "filter -s A ex1.vcf.gz > %(out)s_ex1.filter", # # exit # # "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck", -# # segfauld, used to work wit bcftools 1.3 +# # segfault, used to work with bcftools 1.3 # # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh", # "stats ex1.vcf.gz > %(out)s_ex1.stats", # ] @@ -415,9 +420,6 @@ if sys.platform != "darwin": if __name__ == "__main__": - # build data files - print("building data files") - subprocess.call("make -C %s" % BAM_DATADIR, shell=True) print("starting tests") unittest.main() print("completed tests") diff --git a/tests/tabix_data/Makefile b/tests/tabix_data/Makefile new file mode 100644 index 0000000..22e5f55 --- /dev/null +++ b/tests/tabix_data/Makefile @@ -0,0 +1,7 @@ +all: all.stamp + +all.stamp: + touch $@ + +clean: + -rm -f all.stamp diff --git a/tests/tabix_data/example.bed.gz.tbi b/tests/tabix_data/example.bed.gz.tbi index a529607bb4551fa61b3ed359b8e33c2d9e92bf5d..cf79b95ae232e068968aa9d863a03164350253bd 100644 GIT binary patch delta 40 tcmX@WxQ|g#zMF%E0R;ZDF*rFhF)W-Y*u?F$xj9IRjX~6Z|CNag6ac@<3f2Gs delta 42 vcmdnTcz{t*zMF%E0R;ZDF*rFhF)W@a*u=X?XfL0oK?4JWtJ+@OiSrZy(P;{_ diff --git a/tests/tabix_data/example.gff3.gz.tbi b/tests/tabix_data/example.gff3.gz.tbi index 855e13926d35975f3b8d38b5cba65228dd4ecd00..d23afbb6fc39c07e57879c620a1b4b113dd33f9c 100644 GIT binary patch delta 1388 zcmV-y1(W)*3$6_5FNcI}Bv=ob8;m_kr=+_j{f_&+q@)&Ytr;-7OvY`FZge@zLS0g5j^#%8h)# z|G<&Dyq=M0d2DpOBA$G2Z8)ns^0!+ezOWE8tqrR*4*zV3*yFLfB=YY+7jbueSh-Q{*%|Tf ze;32EY9IGU-4o$@mEYwalr`Fqf4+3LeootayKkRXeV>i|HI1_~+r#fPP6~@7z9Ba# zf4L~)lZ(T8^{@NY^QNvc@(Im%&-=65kH?>n^Ml&i?d$$&&^TQ8OvHIB^t|TptjCF^U<=fUH$8R{a|7E>*B~)EQ)wYc4|NFKd%Ru zw7s`;KVMe)U2c!-POUS399Ph#{kR;j7KR;aAGgc!i2Q# zG#@+)rwO8Z7{qNVcL9N$br#+7b 
zHLpCM+>fuT9lhRrJiMuK@A_Ps7rv!-bo;p{qyVoJ1VE^<8}3@ z=9}l4Uypj#Zf-BH&;MvVcpP}$_*dn3x&8Y?zUKYthsR8+blc>b?(3PZe>a<~$-l!5 zA!6F~-yXT9|CU0;?APxBWQzV9499vO5i!H{dnVJZ-?PEY(C@uZmwwM5e-YEK|A#nP z2b1*wpt4Q>uL=^*XO~#p_KRcbN-a<^HzNQCa zCg^JnAtGj*zQ$FinS6~ce;nWIh=@2wUwai1F;n!lkkh5Fh5R7zWVyc9DW+CmYZVa@ z5fKqF)%u9Te!A|fIpA|fKDf67_g5L4$hL_|bH zL`1|1cxF3hqrJ_Dh=>nwq6HCga^6NnL_|bHM4Un|>_EiH+TV(Zh*Rm|ors8tnC-Uh znw%wVx#3+zL_|bHL_|bHM8wSXKc;m(NdW%<03VA81ONa4009360763o0CWKD#X$;0 zPyhf>XHjz&jE`}dPARuhZll~r*(zl-*=WiP-rKxrOrnKt+qP}nwr$(CZQH)yeQMk1 uy3MwKKW*RrkZ&Ic-{Hx&KOTzzxGTTAsrFsH{EHv;v9D(G5}Z|&kp&K4=;h(%aJ16=KkCXT@gJujHT)&5uQ@S*HQJ<0E6b|xE}(KV!PCaCpv zv`C=^rrkxE^l>(`wKbcNr+zzovS(IwS=wgc5=Fn|Wb>Lc z09fZp)$n@?+!QcRT~0<52+Wu2EA0KIK4D~}i2k9PaC%CWmfskAKB{(~oH!9=;Zci-x7mC}puy8}ZZ9HHG+(OCGg$a;*hI_o`K9Uax2tKx+Te5AqtOuO@{8ZxTTO8pvNJteRY60YvsAY%QO79l^}xI;r!wl%QmBy| z0i`f0IHL_{T5%jM`El385bYYOlB#2Fgf_u)rpr9*9L5z-U)F7x2Uj@I&%y>STF0c^ z7@VwO?~|wbY{8_(MsXwOhHMs=0JNYW5vB?hIJ`?Nlt^uh(uZjPpVUM-IYMk=*`6UY z@^r`Y>&)*8SS}$qf*g4J#Jd6J_KAlILBJwQ+pl73YrIHXFidM_&=gg}Lvx@Z z;f-7LB0^{qq~uJ`*UPD1G=8rE`6+Gl%Oo9PRCmgXDPCS^OsMPwFopQ&M~juGr5Jao zM$dC-4ljc9mnM-1i?YOMHPxA!Q{0fu(T!DkBp^wc%6Bd=1{LEEs>Aglg^W&J?$vdi zd7Vo@WF+}~@?ot-*K-@NK8$cRGL;6VRyk9wBfB5!8?ETYpIYDit3k z&Z&| z=+JSnfFY?6$MQE3xnmXSLa-esca_>O=7II9veGdgh0I|t;XJbFJZ5?~$Uvzir`(Y+ zR>fKuiKj@H)yK-f5;Mt$@X40RX0Q#lhu=BJhnc#}_b$i=J6*@b;YiwIM6ls*7f@$^ zL1X<-xE}Zc@rtac|HPhE!DS(OAPgZ)UW_c7RV5$rP!iDve@hgBLL~@@2EzIoUORy` zB}p1*X9NNv{FUq!#GAz)H(TJ(bl6WU>s8iotb1mJ@OVE4v2?HIo}Ty~t7!+Nhe}AB zyy6?Yi>(yljE4YU%tO*DX0Jiqw~;3UP#u2XmG6ab{3# zFv`ktO{_$65l87m??KX;HVj{p?<-*!(20QqLoi@pVE6dS%<3QiX!6H~uy4_dJ=N``mR)x7Y;~+)|8KT vNq9C_H4>yIEZTYpjC8@y$eT*=z}+QhwdE6SEyLDc@eSO#<_{K diff --git a/tests/tabix_data/example_badcomments.bed.gz.tbi b/tests/tabix_data/example_badcomments.bed.gz.tbi index 04631805ed33127ce32fccbbfe15d90115e442f4..0ab947f612fe56e7947ef2af0285085eebdba6dc 100644 GIT binary patch delta 50 ycmX@ac!+VrbnS(=95s2F8NTk^es>=u1A}}w2MYrT{AXiua%N(XW?%+$K?DGiPYW*q delta 50 
ycmX@ac!+VrbnU=)w;h~J44%8b9Nx#sz#!kv!NLFn|JfLvoS7J;8JNLb5CH&rR|=B= diff --git a/tests/tabix_data/example_badcomments.gtf.gz.tbi b/tests/tabix_data/example_badcomments.gtf.gz.tbi index c7731fc26fc6d9d607c81bbae0a274f1e775cc29..16fb1355db063d4ecb95544d85244910b42d1f89 100644 GIT binary patch delta 21 dcmX@cc#LtvZ0=REQ!W`WFwC{xTQzZ+0svkd2#Npz delta 21 dcmX@cc#LtvZ0^#Gme~pn3_A~Z@J(E%003Nf2uc6| diff --git a/tests/tabix_data/example_badcomments.vcf.gz.tbi b/tests/tabix_data/example_badcomments.vcf.gz.tbi index 366004b49e71da8b701f63febfc6b5e4a5930b21..38f4b591f25cea7ada5d617a324070f784b2cfd2 100644 GIT binary patch delta 46 zcmdnRxPwtpzMF%E0R;ZDF*rFhG0dJQSSEP!t;|yQOo0{4C$up#`2Bk|d*Unw03wbL AO#lD@ delta 48 zcmdnNxQkIxzMF%E0R;ZDF*rFhG0dGPSSIZLGEiz}+fv1%OqZx;28P=~+A0%gC;$K} CI}OnQ diff --git a/tests/tabix_data/example_comments.bed.gz.tbi b/tests/tabix_data/example_comments.bed.gz.tbi index 42544b2390d8b8a725b4fd64161c47dff4d6ff36..89b1bb3384da9062c4055b64eb115855b3170981 100644 GIT binary patch delta 51 zcmX@ac!+VrG@Y(A_N68~3=DTU*FW6H$iN`q&B4L|0{__jW(hSUCE{FgCUjPZj diff --git a/tests/tabix_data/example_comments.gtf.gz.tbi b/tests/tabix_data/example_comments.gtf.gz.tbi index 2f33d40ff588ca01095f089f0d184bd4aca2cd9d..54f5389f960f8bb24e6375befeb007509d2bc953 100644 GIT binary patch delta 21 dcmX@cc#LtvZ0=REQ!W`WF!cQGPoB6;0RUj*2=o8| delta 21 dcmX@cc#LtvZ0^#Gme~pn42KpoY@N7F0RUU-2vPt5 diff --git a/tests/tabix_data/example_comments.vcf.gz.tbi b/tests/tabix_data/example_comments.vcf.gz.tbi index 366004b49e71da8b701f63febfc6b5e4a5930b21..38f4b591f25cea7ada5d617a324070f784b2cfd2 100644 GIT binary patch delta 46 zcmdnRxPwtpzMF%E0R;ZDF*rFhG0dJQSSEP!t;|yQOo0{4C$up#`2Bk|d*Unw03wbL AO#lD@ delta 48 zcmdnNxQkIxzMF%E0R;ZDF*rFhG0dGPSSIZLGEiz}+fv1%OqZx;28P=~+A0%gC;$K} CI}OnQ diff --git a/tests/tabix_test.py b/tests/tabix_test.py index c17f7ff..7546175 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -11,15 +11,18 @@ import shutil import gzip import pysam import unittest -import 
subprocess import glob import re from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \ - load_and_convert, TABIX_DATADIR, get_temp_filename + load_and_convert, make_data_files, TABIX_DATADIR, get_temp_filename IS_PYTHON3 = sys.version_info[0] >= 3 +def setUpModule(): + make_data_files(TABIX_DATADIR) + + def myzip_open(infile, mode="r"): '''open compressed file and decode.''' @@ -1239,5 +1242,4 @@ class TestMultithreadTabixFile(unittest.TestCase): if __name__ == "__main__": - subprocess.call("make -C %s" % TABIX_DATADIR, shell=True) unittest.main() diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py index 7ad7db0..1806909 100644 --- a/tests/tabixproxies_test.py +++ b/tests/tabixproxies_test.py @@ -5,7 +5,11 @@ import sys import re import copy import gzip -from TestUtils import load_and_convert, TABIX_DATADIR +from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR + + +def setUpModule(): + make_data_files(TABIX_DATADIR) class TestParser(unittest.TestCase): diff --git a/tests/test_samtools_python.py b/tests/test_samtools_python.py index f30ff9c..da4d332 100644 --- a/tests/test_samtools_python.py +++ b/tests/test_samtools_python.py @@ -1,7 +1,11 @@ import pysam import os import pytest -from TestUtils import BAM_DATADIR +from TestUtils import make_data_files, BAM_DATADIR + + +def setUpModule(): + make_data_files(BAM_DATADIR) def test_idxstats_parse_split_lines(): -- 2.30.2