From b9d055f3408a398a66a7581be362596073d52690 Mon Sep 17 00:00:00 2001 From: Andreas Tille Date: Tue, 7 Dec 2021 09:21:05 +0100 Subject: [PATCH] New upstream version 0.18.0+ds --- .github/workflows/ci.yaml | 11 +- .github/workflows/release.yaml | 26 +- Containerfile | 13 + README.rst | 2 +- bcftools/abuf.c | 128 ++++- bcftools/abuf.c.pysam.c | 128 ++++- bcftools/bam2bcf.c | 30 +- bcftools/bam2bcf.c.pysam.c | 30 +- bcftools/bam2bcf.h | 1 + bcftools/bam2bcf_indel.c | 17 +- bcftools/bam2bcf_indel.c.pysam.c | 17 +- bcftools/bcftools.h | 1 + bcftools/consensus.c | 2 +- bcftools/consensus.c.pysam.c | 2 +- bcftools/convert.c | 143 +++-- bcftools/convert.c.pysam.c | 143 +++-- bcftools/convert.h | 4 +- bcftools/csq.c | 221 ++++--- bcftools/csq.c.pysam.c | 221 ++++--- bcftools/mcall.c | 20 +- bcftools/mcall.c.pysam.c | 20 +- bcftools/mpileup.c | 129 +++-- bcftools/mpileup.c.pysam.c | 129 +++-- bcftools/vcfannotate.c | 411 +++++++++---- bcftools/vcfannotate.c.pysam.c | 411 +++++++++---- bcftools/vcfcall.c | 92 +-- bcftools/vcfcall.c.pysam.c | 92 +-- bcftools/vcfcnv.c | 73 ++- bcftools/vcfcnv.c.pysam.c | 73 ++- bcftools/vcfconcat.c | 222 ++++--- bcftools/vcfconcat.c.pysam.c | 222 ++++--- bcftools/vcfconvert.c | 136 +++-- bcftools/vcfconvert.c.pysam.c | 136 +++-- bcftools/vcffilter.c | 69 ++- bcftools/vcffilter.c.pysam.c | 69 ++- bcftools/vcfgtcheck.c | 31 +- bcftools/vcfgtcheck.c.pysam.c | 31 +- bcftools/vcfindex.c | 51 +- bcftools/vcfindex.c.pysam.c | 51 +- bcftools/vcfisec.c | 89 ++- bcftools/vcfisec.c.pysam.c | 89 ++- bcftools/vcfmerge.c | 87 ++- bcftools/vcfmerge.c.pysam.c | 87 ++- bcftools/vcfnorm.c | 61 +- bcftools/vcfnorm.c.pysam.c | 61 +- bcftools/vcfplugin.c | 67 ++- bcftools/vcfplugin.c.pysam.c | 67 ++- bcftools/vcfquery.c | 55 +- bcftools/vcfquery.c.pysam.c | 55 +- bcftools/vcfroh.c | 20 + bcftools/vcfroh.c.pysam.c | 20 + bcftools/vcfsort.c | 120 +++- bcftools/vcfsort.c.pysam.c | 120 +++- bcftools/vcfstats.c | 34 +- bcftools/vcfstats.c.pysam.c | 34 +- 
bcftools/vcfview.c | 103 ++-- bcftools/vcfview.c.pysam.c | 103 ++-- bcftools/version.c | 20 + bcftools/version.c.pysam.c | 20 + bcftools/version.sh | 2 +- devtools/import.py | 5 +- doc/index.rst | 2 +- doc/release.rst | 10 + pysam/libcalignmentfile.pyx | 9 + pysam/libctabixproxies.pxd | 6 +- pysam/libctabixproxies.pyx | 33 +- pysam/libcutils.pyx | 3 + pysam/samtools.py | 1 + pysam/version.h | 6 +- pysam/version.py | 8 +- samtools/README | 10 +- samtools/bam.c | 59 +- samtools/bam.c.pysam.c | 59 +- samtools/bam.h | 542 +----------------- samtools/bam2bcf.c | 28 +- samtools/bam2bcf.c.pysam.c | 28 +- samtools/bam2depth.c | 11 +- samtools/bam2depth.c.pysam.c | 11 +- samtools/bam_aux.c | 16 +- samtools/bam_aux.c.pysam.c | 16 +- samtools/bam_endian.h | 66 --- samtools/bam_fastq.c | 8 +- samtools/bam_fastq.c.pysam.c | 26 +- samtools/bam_import.c | 12 +- samtools/bam_import.c.pysam.c | 12 +- samtools/bam_markdup.c | 6 +- samtools/bam_markdup.c.pysam.c | 6 +- samtools/bam_plcmd.c | 208 +++++-- samtools/bam_plcmd.c.pysam.c | 208 +++++-- samtools/bam_reheader.c | 2 +- samtools/bam_reheader.c.pysam.c | 2 +- samtools/bam_samples.c | 433 ++++++++++++++ samtools/bam_samples.c.pysam.c | 435 ++++++++++++++ samtools/bam_sort.c | 154 +++-- samtools/bam_sort.c.pysam.c | 154 +++-- samtools/bamshuf.c.pysam.c | 16 +- samtools/bamtk.c | 3 + samtools/bamtk.c.pysam.c | 3 + samtools/coverage.c | 2 +- samtools/coverage.c.pysam.c | 2 +- samtools/sam.c | 147 ----- samtools/sam.c.pysam.c | 149 ----- samtools/sam.h | 151 ----- samtools/sam_utils.c | 13 + samtools/sam_utils.c.pysam.c | 13 + samtools/sam_view.c | 231 ++++++-- samtools/sam_view.c.pysam.c | 231 ++++++-- samtools/samtools.h | 10 + samtools/stats.c | 61 +- samtools/stats.c.pysam.c | 61 +- samtools/version.sh | 2 +- tests/tabix_data/Makefile | 37 +- tests/tabix_data/empty.bed.gz.tbi | Bin 75 -> 0 bytes tests/tabix_data/example.bed.gz | Bin 819 -> 0 bytes tests/tabix_data/example.bed.gz.tbi | Bin 190 -> 0 bytes 
tests/tabix_data/example.gff2.gz.tbi | Bin 107 -> 0 bytes tests/tabix_data/example.gff3.gz.tbi | Bin 1454 -> 0 bytes tests/tabix_data/example.sam.gz.tbi | Bin 128 -> 0 bytes tests/tabix_data/example.vcf.gz.tbi | Bin 180 -> 0 bytes .../tabix_data/example_badcomments.bed.gz.tbi | Bin 194 -> 0 bytes .../tabix_data/example_badcomments.gtf.gz.tbi | Bin 198 -> 0 bytes .../tabix_data/example_badcomments.sam.gz.tbi | Bin 128 -> 0 bytes .../tabix_data/example_badcomments.vcf.gz.tbi | Bin 184 -> 0 bytes tests/tabix_data/example_comments.bed.gz.tbi | Bin 194 -> 0 bytes tests/tabix_data/example_comments.gtf.gz.tbi | Bin 198 -> 0 bytes tests/tabix_data/example_comments.sam.gz.tbi | Bin 128 -> 0 bytes tests/tabix_data/example_comments.vcf.gz.tbi | Bin 184 -> 0 bytes tests/tabix_data/example_large.bed.gz.tbi | Bin 7877 -> 0 bytes tests/tabix_data/fivecolumns.bed | 4 + tests/tabix_data/vcf/16.vcf | 2 +- tests/tabix_data/vcf/2.vcf | 2 +- tests/tabix_data/vcf/20.vcf | 2 +- tests/tabix_data/vcf/23.vcf | 2 +- tests/tabix_test.py | 4 +- tests/tabixproxies_test.py | 35 +- 135 files changed, 5645 insertions(+), 3055 deletions(-) create mode 100644 Containerfile delete mode 100644 samtools/bam_endian.h create mode 100644 samtools/bam_samples.c create mode 100644 samtools/bam_samples.c.pysam.c delete mode 100644 samtools/sam.c delete mode 100644 samtools/sam.c.pysam.c delete mode 100644 samtools/sam.h delete mode 100644 tests/tabix_data/empty.bed.gz.tbi delete mode 100644 tests/tabix_data/example.bed.gz delete mode 100644 tests/tabix_data/example.bed.gz.tbi delete mode 100644 tests/tabix_data/example.gff2.gz.tbi delete mode 100644 tests/tabix_data/example.gff3.gz.tbi delete mode 100644 tests/tabix_data/example.sam.gz.tbi delete mode 100644 tests/tabix_data/example.vcf.gz.tbi delete mode 100644 tests/tabix_data/example_badcomments.bed.gz.tbi delete mode 100644 tests/tabix_data/example_badcomments.gtf.gz.tbi delete mode 100644 tests/tabix_data/example_badcomments.sam.gz.tbi delete mode 
100644 tests/tabix_data/example_badcomments.vcf.gz.tbi delete mode 100644 tests/tabix_data/example_comments.bed.gz.tbi delete mode 100644 tests/tabix_data/example_comments.gtf.gz.tbi delete mode 100644 tests/tabix_data/example_comments.sam.gz.tbi delete mode 100644 tests/tabix_data/example_comments.vcf.gz.tbi delete mode 100644 tests/tabix_data/example_large.bed.gz.tbi create mode 100644 tests/tabix_data/fivecolumns.bed diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4075f1c..ef562f9 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,7 +1,6 @@ name: CI -# on: [push, pull_request] -on: [pull_request] +on: [push, pull_request] jobs: direct: @@ -9,7 +8,7 @@ jobs: strategy: matrix: os: [ubuntu, macos] - python-version: [2.7, 3.6, 3.7, 3.8, 3.9] + python-version: ['2.7', '3.6', '3.7', '3.8', '3.9', '3.10'] exclude: # Run only the latest 2.x and 3.x on macOS - os: macos @@ -18,6 +17,8 @@ jobs: python-version: 3.7 - os: macos python-version: 3.8 + - os: macos + python-version: 3.9 steps: - name: Checkout pysam @@ -63,7 +64,7 @@ jobs: strategy: matrix: os: [ubuntu, macos] - python-version: [3.9] + python-version: ['3.10'] steps: - name: Checkout pysam @@ -119,7 +120,7 @@ jobs: strategy: matrix: os: [ubuntu] - python-version: [3.7] + python-version: ['3.7'] defaults: run: shell: bash -l {0} # needed for conda activation diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index bbc954f..3285fc7 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -16,8 +16,12 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, macos-10.15] # windows-2019, - + os: [ubuntu-latest, macos-10.15] # windows-2019, + cibw_archs: ["auto"] + # include: + # - os: ubuntu-latest + # cibw_archs: "aarch64" + steps: - name: Checkout pysam uses: actions/checkout@v2 @@ -26,27 +30,28 @@ jobs: uses: actions/setup-python@v2 with: python-version: '3.8' - + - name: Install prerequisite 
Python libraries run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip pip install cython pytest pytest-pep8 - name: Build wheels for linux if: runner.os == 'Linux' - uses: pypa/cibuildwheel@v2.1.2 + uses: pypa/cibuildwheel@v2.2.2 env: - CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* + CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* cp310-* + CIBW_SKIP: "*musllinux*" CIBW_BEFORE_BUILD: yum install -y libcurl-devel zlib-devel bzip2-devel xz-devel && pip install cython - CIBW_MANYLINUX_X86_64_IMAGE: manylinux1 - CIBW_MANYLINUX_I686_IMAGE: manylinux1 + CIBW_ARCHS: ${{ matrix.cibw_archs }} - name: Build wheels for macos if: runner.os != 'Linux' - uses: pypa/cibuildwheel@v2.1.2 + uses: pypa/cibuildwheel@v2.2.2 env: - CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* + CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* cp310-* CIBW_BEFORE_BUILD: pip install cython + CIBW_ARCHS: ${{ matrix.cibw_archs }} - name: Upload artifacts uses: actions/upload-artifact@v2 @@ -110,6 +115,5 @@ jobs: if: github.event_name == 'release' && github.event.action == 'published' uses: pypa/gh-action-pypi-publish@master with: - user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/Containerfile b/Containerfile new file mode 100644 index 0000000..633f07e --- /dev/null +++ b/Containerfile @@ -0,0 +1,13 @@ +FROM ubi8:latest + +RUN yum update \ + && yum install -y python3-pip python3-devel pigz \ + && cd /usr/local/bin \ + && ln -s /usr/bin/python3 python \ + && pip3 --no-cache-dir install --upgrade pip \ + && yum clean all \ + && echo "system packages installed" + +RUN python -m pip install pysam + +WORKDIR /opt/ diff --git a/README.rst b/README.rst index 368984a..547868b 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.13, samtools-1.13, and bcftools-1.13. 
+The current version of pysam wraps 3rd-party code from htslib-1.14, samtools-1.14, and bcftools-1.14. Pysam is available through `pypi `_. To install, type:: diff --git a/bcftools/abuf.c b/bcftools/abuf.c index 5e45e9e..a97332a 100644 --- a/bcftools/abuf.c +++ b/bcftools/abuf.c @@ -295,13 +295,13 @@ static void _split_table_set_chrom_qual(abuf_t *buf) bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt); } } +int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode) { const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key); int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key); int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key); if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags - if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings if ( type==BCF_HT_LONG ) return; // todo: 64bit integers bcf1_t *rec = buf->split.rec; @@ -311,7 +311,10 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo // Check for incorrect number of values. Note this check does not consider all values missing // and will remove annotations that don't pass. 
- if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return; + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) + { + if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return; + } if ( buf->mtmp2 < buf->mtmp ) { @@ -320,9 +323,14 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo buf->mtmp2 = buf->mtmp; } + const int num_size = 4; + assert( num_size==sizeof(int32_t) && num_size==sizeof(float) ); int32_t missing = bcf_int32_missing; void *missing_ptr = (void*)&missing; if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + int32_t vector_end = bcf_int32_vector_end; + void *vector_end_ptr = (void*)&vector_end; + if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr)); int iout,i; for (iout=0; ioutsplit.nout; iout++) @@ -332,21 +340,40 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? 
NULL : buf->tmp, nval, type); - else if ( len==BCF_VL_A ) + else if ( len==BCF_VL_A && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial - 1; assert( ioritmp2,buf->tmp+4*iori,4); + if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) ) + memcpy(buf->tmp2,missing_ptr,num_size); + else + memcpy(buf->tmp2,buf->tmp+num_size*iori,num_size); if ( star_allele ) - memcpy(buf->tmp2+4,missing_ptr,4); + memcpy(buf->tmp2+num_size,missing_ptr,num_size); ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type); } - else if ( len==BCF_VL_R ) + else if ( len==BCF_VL_A && type==BCF_HT_STR ) { - memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records + int iori = buf->split.atoms[iout]->ial - 1; + kstring_t dst; + dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2; + kputc('.',&dst); + if ( star_allele ) kputs(",.",&dst); + copy_string_field(buf->tmp, iori, nval, &dst, 0); + if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1); + buf->mtmp2 = dst.m; + buf->tmp2 = dst.s; + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type); + } + else if ( len==BCF_VL_R && type!=BCF_HT_STR ) + { + memcpy(buf->tmp2,buf->tmp,num_size); // REF contributes to all records int iori = buf->split.atoms[iout]->ial; assert( iorisplit.nori ); - memcpy(buf->tmp2+4,buf->tmp+4*iori,4); + if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) ) + memcpy(buf->tmp2+num_size,missing_ptr,num_size); + else + memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size); if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; @@ -356,9 +383,23 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo } } if ( star_allele ) - memcpy(buf->tmp2+8,missing_ptr,4); + memcpy(buf->tmp2+2*num_size,missing_ptr,num_size); ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type); } + else if ( len==BCF_VL_R && type==BCF_HT_STR ) + { + int iori = 
buf->split.atoms[iout]->ial - 1; + kstring_t dst; + dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2; + kputs(".,.",&dst); + if ( star_allele ) kputs(",.",&dst); + copy_string_field(buf->tmp, 0, nval, &dst, 0); + copy_string_field(buf->tmp, iori+1, nval, &dst, 1); + if ( star_allele ) copy_string_field(".", 0, 1, &dst, 2); + buf->mtmp2 = dst.m; + buf->tmp2 = dst.s; + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type); + } if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag); } } @@ -449,7 +490,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id); int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id); - if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_STR && len==BCF_VL_G ) return; // possible todo: Number=G for strings if ( type==BCF_HT_LONG ) return; // todo: 64bit integers const int num_size = 4; @@ -457,23 +498,37 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo int32_t missing = bcf_int32_missing; void *missing_ptr = (void*)&missing; if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + int32_t vector_end = bcf_int32_vector_end; + void *vector_end_ptr = (void*)&vector_end; + if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr)); bcf1_t *rec = buf->split.rec; int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? 
buf->mtmp/num_size : buf->mtmp; // number of items int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes - if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) + { + if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid - // Check for incorrect number of values. Note this check does not consider all values missing - // and will remove annotations that don't pass. - if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return; + // Check for incorrect number of values. Note this check does not consider all values missing + // and will remove annotations that don't pass. + if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return; + } // Increase buffer size to accommodate star allele int nval1 = nval / nsmpl; mtmp = buf->mtmp; - if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele - else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) + { + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele + else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3); + } + else if ( type==BCF_HT_STR ) + { + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < nsmpl*(nval1+2) ) mtmp = nsmpl*(nval1+2); // +2 for the possibility of the star allele, ",." 
+ else if ( len==BCF_VL_G && mtmp < nsmpl*(nval1+6) ) mtmp = nsmpl*(nval1+6); + } if ( buf->mtmp2 < mtmp ) { @@ -490,7 +545,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); - else if ( len==BCF_VL_A ) + else if ( len==BCF_VL_A && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial - 1; assert( ioritmp + nval1*num_size*i; void *dst = buf->tmp2 + num_size*i*(star_allele+1); - memcpy(dst,src+iori*num_size,num_size); + if ( !memcmp(vector_end_ptr,src+iori*num_size,num_size) ) + memcpy(dst,missing_ptr,num_size); + else + memcpy(dst,src+iori*num_size,num_size); if ( star_allele ) memcpy(dst+num_size,missing_ptr,num_size); } ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type); } - else if ( len==BCF_VL_R ) + else if ( (len==BCF_VL_A || len==BCF_VL_R) && type==BCF_HT_STR ) + { + int ioff = len==BCF_VL_R ? 1 : 0; + int iori = buf->split.atoms[iout]->ial - 1; + int nval1_dst = star_allele ? 
nval1 + 2 : nval1; + memset(buf->tmp2,0,nval1_dst*nsmpl); + for (i=0; itmp2 + nval1_dst*i; + kputc_('.',&dst); + if ( star_allele ) kputsn_(",.",2,&dst); + if ( len==BCF_VL_R ) + { + kputsn_(",.",2,&dst); + copy_string_field(buf->tmp+nval1*i, 0, nval1, &dst, 0); + } + copy_string_field(buf->tmp+nval1*i, iori+ioff, nval1, &dst, 0+ioff); + if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1+ioff); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nval1_dst*nsmpl, type); + } + else if ( len==BCF_VL_R && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial; assert( iori<=nval ); @@ -514,7 +594,6 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo void *dst = buf->tmp2 + num_size*i*(star_allele+2); memcpy(dst,src,num_size); memcpy(dst+num_size,src+iori*num_size,num_size); - if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; @@ -526,7 +605,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo } ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type); } - else if ( len==BCF_VL_G ) + else if ( len==BCF_VL_G && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial; int i01 = bcf_alleles2gt(0,iori); @@ -691,6 +770,13 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) _split_table_set_format(buf, &rec->d.fmt[i], mode); } + + // Check that at least one FORMAT field was added, if not, the number of samples must be set manually + for (i=0; isplit.nout; i++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)]; + if ( !out->n_sample ) out->n_sample = rec->n_sample; + } } void abuf_push(abuf_t *buf, bcf1_t *rec) diff --git a/bcftools/abuf.c.pysam.c b/bcftools/abuf.c.pysam.c index 811ef10..a727836 100644 --- a/bcftools/abuf.c.pysam.c +++ b/bcftools/abuf.c.pysam.c @@ -297,13 +297,13 @@ static void _split_table_set_chrom_qual(abuf_t *buf) bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt); } } +int 
copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode) { const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key); int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key); int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key); if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags - if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings if ( type==BCF_HT_LONG ) return; // todo: 64bit integers bcf1_t *rec = buf->split.rec; @@ -313,7 +313,10 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo // Check for incorrect number of values. Note this check does not consider all values missing // and will remove annotations that don't pass. - if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return; + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) + { + if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return; + } if ( buf->mtmp2 < buf->mtmp ) { @@ -322,9 +325,14 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo buf->mtmp2 = buf->mtmp; } + const int num_size = 4; + assert( num_size==sizeof(int32_t) && num_size==sizeof(float) ); int32_t missing = bcf_int32_missing; void *missing_ptr = (void*)&missing; if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + int32_t vector_end = bcf_int32_vector_end; + void *vector_end_ptr = (void*)&vector_end; + if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr)); int iout,i; for (iout=0; ioutsplit.nout; iout++) @@ -334,21 +342,40 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? 
NULL : buf->tmp, nval, type); - else if ( len==BCF_VL_A ) + else if ( len==BCF_VL_A && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial - 1; assert( ioritmp2,buf->tmp+4*iori,4); + if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) ) + memcpy(buf->tmp2,missing_ptr,num_size); + else + memcpy(buf->tmp2,buf->tmp+num_size*iori,num_size); if ( star_allele ) - memcpy(buf->tmp2+4,missing_ptr,4); + memcpy(buf->tmp2+num_size,missing_ptr,num_size); ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type); } - else if ( len==BCF_VL_R ) + else if ( len==BCF_VL_A && type==BCF_HT_STR ) { - memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records + int iori = buf->split.atoms[iout]->ial - 1; + kstring_t dst; + dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2; + kputc('.',&dst); + if ( star_allele ) kputs(",.",&dst); + copy_string_field(buf->tmp, iori, nval, &dst, 0); + if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1); + buf->mtmp2 = dst.m; + buf->tmp2 = dst.s; + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type); + } + else if ( len==BCF_VL_R && type!=BCF_HT_STR ) + { + memcpy(buf->tmp2,buf->tmp,num_size); // REF contributes to all records int iori = buf->split.atoms[iout]->ial; assert( iorisplit.nori ); - memcpy(buf->tmp2+4,buf->tmp+4*iori,4); + if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) ) + memcpy(buf->tmp2+num_size,missing_ptr,num_size); + else + memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size); if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; @@ -358,9 +385,23 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo } } if ( star_allele ) - memcpy(buf->tmp2+8,missing_ptr,4); + memcpy(buf->tmp2+2*num_size,missing_ptr,num_size); ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type); } + else if ( len==BCF_VL_R && type==BCF_HT_STR ) + { + int iori = 
buf->split.atoms[iout]->ial - 1; + kstring_t dst; + dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2; + kputs(".,.",&dst); + if ( star_allele ) kputs(",.",&dst); + copy_string_field(buf->tmp, 0, nval, &dst, 0); + copy_string_field(buf->tmp, iori+1, nval, &dst, 1); + if ( star_allele ) copy_string_field(".", 0, 1, &dst, 2); + buf->mtmp2 = dst.m; + buf->tmp2 = dst.s; + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type); + } if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag); } } @@ -451,7 +492,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id); int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id); - if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_STR && len==BCF_VL_G ) return; // possible todo: Number=G for strings if ( type==BCF_HT_LONG ) return; // todo: 64bit integers const int num_size = 4; @@ -459,23 +500,37 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo int32_t missing = bcf_int32_missing; void *missing_ptr = (void*)&missing; if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + int32_t vector_end = bcf_int32_vector_end; + void *vector_end_ptr = (void*)&vector_end; + if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr)); bcf1_t *rec = buf->split.rec; int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? 
buf->mtmp/num_size : buf->mtmp; // number of items int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes - if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) + { + if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid - // Check for incorrect number of values. Note this check does not consider all values missing - // and will remove annotations that don't pass. - if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return; + // Check for incorrect number of values. Note this check does not consider all values missing + // and will remove annotations that don't pass. + if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return; + } // Increase buffer size to accommodate star allele int nval1 = nval / nsmpl; mtmp = buf->mtmp; - if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele - else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) + { + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele + else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3); + } + else if ( type==BCF_HT_STR ) + { + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < nsmpl*(nval1+2) ) mtmp = nsmpl*(nval1+2); // +2 for the possibility of the star allele, ",." 
+ else if ( len==BCF_VL_G && mtmp < nsmpl*(nval1+6) ) mtmp = nsmpl*(nval1+6); + } if ( buf->mtmp2 < mtmp ) { @@ -492,7 +547,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); - else if ( len==BCF_VL_A ) + else if ( len==BCF_VL_A && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial - 1; assert( ioritmp + nval1*num_size*i; void *dst = buf->tmp2 + num_size*i*(star_allele+1); - memcpy(dst,src+iori*num_size,num_size); + if ( !memcmp(vector_end_ptr,src+iori*num_size,num_size) ) + memcpy(dst,missing_ptr,num_size); + else + memcpy(dst,src+iori*num_size,num_size); if ( star_allele ) memcpy(dst+num_size,missing_ptr,num_size); } ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type); } - else if ( len==BCF_VL_R ) + else if ( (len==BCF_VL_A || len==BCF_VL_R) && type==BCF_HT_STR ) + { + int ioff = len==BCF_VL_R ? 1 : 0; + int iori = buf->split.atoms[iout]->ial - 1; + int nval1_dst = star_allele ? 
nval1 + 2 : nval1; + memset(buf->tmp2,0,nval1_dst*nsmpl); + for (i=0; itmp2 + nval1_dst*i; + kputc_('.',&dst); + if ( star_allele ) kputsn_(",.",2,&dst); + if ( len==BCF_VL_R ) + { + kputsn_(",.",2,&dst); + copy_string_field(buf->tmp+nval1*i, 0, nval1, &dst, 0); + } + copy_string_field(buf->tmp+nval1*i, iori+ioff, nval1, &dst, 0+ioff); + if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1+ioff); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nval1_dst*nsmpl, type); + } + else if ( len==BCF_VL_R && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial; assert( iori<=nval ); @@ -516,7 +596,6 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo void *dst = buf->tmp2 + num_size*i*(star_allele+2); memcpy(dst,src,num_size); memcpy(dst+num_size,src+iori*num_size,num_size); - if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; @@ -528,7 +607,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo } ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type); } - else if ( len==BCF_VL_G ) + else if ( len==BCF_VL_G && type!=BCF_HT_STR ) { int iori = buf->split.atoms[iout]->ial; int i01 = bcf_alleles2gt(0,iori); @@ -693,6 +772,13 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) _split_table_set_format(buf, &rec->d.fmt[i], mode); } + + // Check that at least one FORMAT field was added, if not, the number of samples must be set manually + for (i=0; isplit.nout; i++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)]; + if ( !out->n_sample ) out->n_sample = rec->n_sample; + } } void abuf_push(abuf_t *buf, bcf1_t *rec) diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c index 336e2f6..76a0d43 100644 --- a/bcftools/bam2bcf.c +++ b/bcftools/bam2bcf.c @@ -337,10 +337,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // Compensate for AD not being counted on low quality REF indel matches. 
if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 ) { - for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0? + for (i=0; i<4; i++) { - r->ADR[i] += ADR_ref_missed[i]; - r->ADF[i] += ADF_ref_missed[i]; + r->ADR[0] += ADR_ref_missed[i]; + r->ADF[0] += ADF_ref_missed[i]; } } else if ( r->ADF && bca->ambig_reads==B2B_INC_AD ) @@ -502,17 +502,18 @@ double mann_whitney_1947_cdf(int n, int m, int U) double calc_mwu_bias_cdf(int *a, int *b, int n) { int na = 0, nb = 0, i; - double U = 0, ties = 0; + double U = 0; + //double ties = 0; for (i=0; iADF && bca->ambig_reads==B2B_INC_AD0 ) { - for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0? + for (i=0; i<4; i++) { - r->ADR[i] += ADR_ref_missed[i]; - r->ADF[i] += ADF_ref_missed[i]; + r->ADR[0] += ADR_ref_missed[i]; + r->ADF[0] += ADF_ref_missed[i]; } } else if ( r->ADF && bca->ambig_reads==B2B_INC_AD ) @@ -504,17 +504,18 @@ double mann_whitney_1947_cdf(int n, int m, int U) double calc_mwu_bias_cdf(int *a, int *b, int n) { int na = 0, nb = 0, i; - double U = 0, ties = 0; + double U = 0; + //double ties = 0; for (i=0; iindel_win_size < max_rd_len + ?2*bca->indel_win_size : max_rd_len); for (i=pos; i(i-pos) ) { @@ -720,8 +719,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; + left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; + right = pos + bca->indel_win_size; if (types[0] < 0) right -= types[0]; // in case the alignments stand out the reference @@ -865,10 +864,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // long read data needs less context. It also tends to // have many more candidate indels to investigate so // speed here matters more. 
- if (pos - left >= INDEL_WINDOW_SIZE) - left2 += INDEL_WINDOW_SIZE/2; - if (right-pos >= INDEL_WINDOW_SIZE) - right2 -= INDEL_WINDOW_SIZE/2; + if (pos - left >= bca->indel_win_size) + left2 += bca->indel_win_size/2; + if (right-pos >= bca->indel_win_size) + right2 -= bca->indel_win_size/2; } int r_start = p->b->core.pos; diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c index 82bf31c..c2287de 100644 --- a/bcftools/bam2bcf_indel.c.pysam.c +++ b/bcftools/bam2bcf_indel.c.pysam.c @@ -39,7 +39,6 @@ DEALINGS IN THE SOFTWARE. */ KSORT_INIT_GENERIC(uint32_t) #define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 110 #define MAX_TYPES 64 @@ -237,8 +236,8 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, // To prevent long stretches of N's to be mistaken for indels // (sometimes thousands of bases), check the number of N's in the // sequence and skip places where half or more reference bases are Ns. - int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len - ?2*INDEL_WINDOW_SIZE : max_rd_len); + int nN=0, i_end = pos + (2*bca->indel_win_size < max_rd_len + ?2*bca->indel_win_size : max_rd_len); for (i=pos; i(i-pos) ) { @@ -722,8 +721,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; + left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; + right = pos + bca->indel_win_size; if (types[0] < 0) right -= types[0]; // in case the alignments stand out the reference @@ -867,10 +866,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // long read data needs less context. It also tends to // have many more candidate indels to investigate so // speed here matters more. 
- if (pos - left >= INDEL_WINDOW_SIZE) - left2 += INDEL_WINDOW_SIZE/2; - if (right-pos >= INDEL_WINDOW_SIZE) - right2 -= INDEL_WINDOW_SIZE/2; + if (pos - left >= bca->indel_win_size) + left2 += bca->indel_win_size/2; + if (right-pos >= bca->indel_win_size) + right2 -= bca->indel_win_size/2; } int r_start = p->b->core.pos; diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index 953cf6b..b188e98 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -51,6 +51,7 @@ void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); const char *hts_bcf_wmode2(int file_type, char *fname); +void set_wmode(char dst[8], int file_type, char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset char *init_tmp_prefix(const char *prefix); void *smalloc(size_t size); // safe malloc diff --git a/bcftools/consensus.c b/bcftools/consensus.c index a232174..2db7fff 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -975,7 +975,7 @@ static void consensus(args_t *args) // determine if uppercase or lowercase is used in this fasta file if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0; - if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l); + if ( args->mask ) mask_region(args, str.s, str.l); kputs(str.s, &args->fa_buf); bcf1_t **rec_ptr = NULL; diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 5105a2e..fa5c14b 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -977,7 +977,7 @@ static void consensus(args_t *args) // determine if uppercase or lowercase is used in this fasta file if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 
1 : 0; - if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l); + if ( args->mask ) mask_region(args, str.s, str.l); kputs(str.s, &args->fa_buf); bcf1_t **rec_ptr = NULL; diff --git a/bcftools/convert.c b/bcftools/convert.c index 71dfb51..7fca60b 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -38,6 +38,7 @@ THE SOFTWARE. */ #include #include #include +#include #include "bcftools.h" #include "variantkey.h" #include "convert.h" @@ -101,6 +102,9 @@ struct _convert_t void *dat; int ndat; char *undef_info_tag; + void *used_tags_hash; + char **used_tags_list; + int nused_tags; int allow_undef_tags; uint8_t **subset_samples; }; @@ -781,14 +785,13 @@ static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, in n /= convert->nsamples; for (i=0; insamples; i++) { - float sum = 0, *ptr = (float*)convert->dat + i*n; + float *ptr = (float*)convert->dat + i*n; int j; for (j=0; j1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); - sum+=ptr[j]; } if ( j==line->n_allele ) ksprintf(str," %f %f %f",ptr[0],0.,ptr[1]); // haploid @@ -1205,7 +1208,35 @@ invalid: kputc('.', str); } -static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) +static void _used_tags_add(convert_t *convert, int type, char *key) +{ + kstring_t str = {0,0,0}; + ksprintf(&str,"%s/%s",type==T_INFO?"INFO":"FORMAT",key); + khash_str2int_inc(convert->used_tags_hash,str.s); + convert->nused_tags++; + convert->used_tags_list = (char**)realloc(convert->used_tags_list,sizeof(*convert->used_tags_list)*convert->nused_tags); + convert->used_tags_list[convert->nused_tags-1] = str.s; +} + + +#define _SET_NON_FORMAT_TAGS(function,key,...) 
\ + if ( !strcmp("CHROM",key) ) { function(__VA_ARGS__, T_CHROM); } \ + else if ( !strcmp("POS",key) ) { function(__VA_ARGS__, T_POS); } \ + else if ( !strcmp("POS0",key) ) { function(__VA_ARGS__, T_POS0); } \ + else if ( !strcmp("END",key) ) { function(__VA_ARGS__, T_END); } \ + else if ( !strcmp("END0",key) ) { function(__VA_ARGS__, T_END0); } \ + else if ( !strcmp("ID",key) ) { function(__VA_ARGS__, T_ID); } \ + else if ( !strcmp("REF",key) ) { function(__VA_ARGS__, T_REF); } \ + else if ( !strcmp("FIRST_ALT",key) ) { function(__VA_ARGS__, T_FIRST_ALT); } \ + else if ( !strcmp("QUAL",key) ) { function(__VA_ARGS__, T_QUAL); } \ + else if ( !strcmp("TYPE",key) ) { function(__VA_ARGS__, T_TYPE); } \ + else if ( !strcmp("FILTER",key) ) { function(__VA_ARGS__, T_FILTER); } \ + else if ( !strcmp("IS_TS",key) ) { function(__VA_ARGS__, T_IS_TS); } \ + else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \ + else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); } + +static void set_type(fmt_t *fmt, int type) { fmt->type = type; } +static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) { convert->nfmt++; if ( convert->nfmt > convert->mfmt ) @@ -1227,26 +1258,22 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key); if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) ) { - if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; } - else if ( !strcmp("POS",key) ) { fmt->type = T_POS; } - else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; } - else if ( !strcmp("END",key) ) { fmt->type = T_END; } - else if ( !strcmp("END0",key) ) { fmt->type = T_END0; } - else if ( !strcmp("ID",key) ) { fmt->type = T_ID; } - else if ( !strcmp("REF",key) ) { fmt->type = T_REF; } - else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } - else if ( !strcmp("FIRST_ALT",key) ) { fmt->type = T_FIRST_ALT; } - else if ( !strcmp("QUAL",key) 
) { fmt->type = T_QUAL; } - else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; } - else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } + _SET_NON_FORMAT_TAGS(set_type,key,fmt) + else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } + else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } - else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } + else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) + { + fmt->type = T_INFO; + _used_tags_add(convert,T_INFO,key); + } } else if ( fmt->type==T_PBINOM ) { fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); + _used_tags_add(convert,T_FORMAT,key); } else if ( fmt->type==T_NPASS ) { @@ -1326,12 +1353,12 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) kputsn(p, q-p, &str); if ( is_gtf ) { - if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf); - else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf); - else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf); + if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); + else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT); + else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT); else if ( !strcmp(str.s, "TBCSQ") ) { - fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf); + fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ); fmt->subscript = parse_subscript(&q); if ( fmt->subscript==-1 ) { @@ -1339,7 +1366,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) } else fmt->subscript++; } - else if ( !strcmp(str.s, "IUPACGT") ) 
register_tag(convert, T_IUPAC_GT, "GT", is_gtf); + else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, "GT", is_gtf, T_IUPAC_GT); else if ( !strcmp(str.s, "INFO") ) { if ( *q!='/' ) @@ -1355,8 +1382,9 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO); fmt->subscript = parse_subscript(&q); + _used_tags_add(convert,T_INFO,str.s); } else if ( !strcmp(str.s,"PBINOM") ) { @@ -1366,47 +1394,33 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) while ( *q && *q!=')' ) q++; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - register_tag(convert, T_PBINOM, str.s, is_gtf); + register_tag(convert, str.s, is_gtf, T_PBINOM); q++; } else if ( !strcmp(str.s,"N_PASS") ) error("N_PASS() must be placed outside the square brackets\n"); else { - fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_FORMAT); fmt->subscript = parse_subscript(&q); } } else { - if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf); - else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf); - else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf); - else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf); - else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf); - else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf); - else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf); + _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) else if ( !strcmp(str.s, "ALT") ) { - fmt_t *fmt = register_tag(convert, 
T_ALT, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); fmt->subscript = parse_subscript(&q); } - else if ( !strcmp(str.s, "FIRST_ALT") ) register_tag(convert, T_FIRST_ALT, str.s, is_gtf); - else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf); - else if ( !strcmp(str.s, "FILTER") ) register_tag(convert, T_FILTER, str.s, is_gtf); - else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf); - else if ( !strcmp(str.s, "IS_TS") ) register_tag(convert, T_IS_TS, str.s, is_gtf); - else if ( !strcmp(str.s, "TYPE") ) register_tag(convert, T_TYPE, str.s, is_gtf); - else if ( !strcmp(str.s, "MASK") ) register_tag(convert, T_MASK, str.s, is_gtf); - else if ( !strcmp(str.s, "LINE") ) register_tag(convert, T_LINE, str.s, is_gtf); - else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, T_CHROM_POS_ID, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, T_GT_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, T_PL_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); - else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); - else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); + else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, str.s, is_gtf, T_CHROM_POS_ID); + else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GT_TO_PROB3); + else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_PL_TO_PROB3); + else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GP_TO_PROB3); + else if ( !strcmp(str.s, "_GT_TO_HAP") ) 
register_tag(convert, str.s, is_gtf, T_GT_TO_HAP); + else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, str.s, is_gtf, T_GT_TO_HAP2); + else if ( !strcmp(str.s, "RSX") ) register_tag(convert, str.s, is_gtf, T_RSX); + else if ( !strcmp(str.s, "VKX") ) register_tag(convert, str.s, is_gtf, T_VKX); else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n"); else if ( !strcmp(str.s, "INFO") ) { @@ -1417,14 +1431,15 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO); fmt->subscript = parse_subscript(&q); + _used_tags_add(convert,T_INFO,str.s); } else - register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO + register_tag(convert, NULL, is_gtf, T_INFO); // the whole INFO } else if ( !strcmp(str.s, "FORMAT") ) - register_tag(convert, T_FORMAT, NULL, 0); + register_tag(convert, NULL, 0, T_FORMAT); else if ( !strcmp(str.s,"N_PASS") ) { if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); @@ -1439,12 +1454,13 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) } if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p-1, &str); - register_tag(convert, T_NPASS, str.s, is_gtf); + register_tag(convert, str.s, is_gtf, T_NPASS); } else { - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO); fmt->subscript = parse_subscript(&q); + _used_tags_add(convert,T_INFO,str.s); } } free(str.s); @@ -1468,7 +1484,7 @@ static char *parse_sep(convert_t *convert, char *p, int is_gtf) q++; } if ( !str.l ) error("Could not parse format string: %s\n", convert->format_str); - 
register_tag(convert, T_SEP, str.s, is_gtf); + register_tag(convert, str.s, is_gtf, T_SEP); free(str.s); return q; } @@ -1479,6 +1495,7 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * convert->header = hdr; convert->format_str = strdup(format_str); convert->max_unpack = BCF_UN_STR; + convert->used_tags_hash = khash_str2int_init(); int i, is_gtf = 0; char *p = convert->format_str; @@ -1488,7 +1505,7 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * switch (*p) { case '[': is_gtf = 1; p++; break; - case ']': is_gtf = 0; register_tag(convert, T_SEP, NULL, 0); p++; break; + case ']': is_gtf = 0; register_tag(convert, NULL, 0, T_SEP); p++; break; case '%': p = parse_tag(convert, p, is_gtf); break; default: p = parse_sep(convert, p, is_gtf); break; } @@ -1519,6 +1536,12 @@ void convert_destroy(convert_t *convert) if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr); free(convert->fmt[i].key); } + if ( convert->nused_tags ) + { + for (i=0; inused_tags; i++) free(convert->used_tags_list[i]); + free(convert->used_tags_list); + } + khash_str2int_destroy(convert->used_tags_hash); free(convert->fmt); free(convert->undef_info_tag); free(convert->dat); @@ -1675,3 +1698,13 @@ int convert_max_unpack(convert_t *convert) return convert->max_unpack; } +int convert_is_tag_used(convert_t *convert, char *tag) +{ + return khash_str2int_has_key(convert->used_tags_hash, tag); +} +const char **convert_list_used_tags(convert_t *convert, int *ntags) +{ + *ntags = convert->nused_tags; + return (const char **)convert->used_tags_list; +} + diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index e3c995f..86cf9e8 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -40,6 +40,7 @@ THE SOFTWARE. 
*/ #include #include #include +#include #include "bcftools.h" #include "variantkey.h" #include "convert.h" @@ -103,6 +104,9 @@ struct _convert_t void *dat; int ndat; char *undef_info_tag; + void *used_tags_hash; + char **used_tags_list; + int nused_tags; int allow_undef_tags; uint8_t **subset_samples; }; @@ -783,14 +787,13 @@ static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, in n /= convert->nsamples; for (i=0; insamples; i++) { - float sum = 0, *ptr = (float*)convert->dat + i*n; + float *ptr = (float*)convert->dat + i*n; int j; for (j=0; j1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); - sum+=ptr[j]; } if ( j==line->n_allele ) ksprintf(str," %f %f %f",ptr[0],0.,ptr[1]); // haploid @@ -1207,7 +1210,35 @@ invalid: kputc('.', str); } -static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) +static void _used_tags_add(convert_t *convert, int type, char *key) +{ + kstring_t str = {0,0,0}; + ksprintf(&str,"%s/%s",type==T_INFO?"INFO":"FORMAT",key); + khash_str2int_inc(convert->used_tags_hash,str.s); + convert->nused_tags++; + convert->used_tags_list = (char**)realloc(convert->used_tags_list,sizeof(*convert->used_tags_list)*convert->nused_tags); + convert->used_tags_list[convert->nused_tags-1] = str.s; +} + + +#define _SET_NON_FORMAT_TAGS(function,key,...) 
\ + if ( !strcmp("CHROM",key) ) { function(__VA_ARGS__, T_CHROM); } \ + else if ( !strcmp("POS",key) ) { function(__VA_ARGS__, T_POS); } \ + else if ( !strcmp("POS0",key) ) { function(__VA_ARGS__, T_POS0); } \ + else if ( !strcmp("END",key) ) { function(__VA_ARGS__, T_END); } \ + else if ( !strcmp("END0",key) ) { function(__VA_ARGS__, T_END0); } \ + else if ( !strcmp("ID",key) ) { function(__VA_ARGS__, T_ID); } \ + else if ( !strcmp("REF",key) ) { function(__VA_ARGS__, T_REF); } \ + else if ( !strcmp("FIRST_ALT",key) ) { function(__VA_ARGS__, T_FIRST_ALT); } \ + else if ( !strcmp("QUAL",key) ) { function(__VA_ARGS__, T_QUAL); } \ + else if ( !strcmp("TYPE",key) ) { function(__VA_ARGS__, T_TYPE); } \ + else if ( !strcmp("FILTER",key) ) { function(__VA_ARGS__, T_FILTER); } \ + else if ( !strcmp("IS_TS",key) ) { function(__VA_ARGS__, T_IS_TS); } \ + else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \ + else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); } + +static void set_type(fmt_t *fmt, int type) { fmt->type = type; } +static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type) { convert->nfmt++; if ( convert->nfmt > convert->mfmt ) @@ -1229,26 +1260,22 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key); if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) ) { - if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; } - else if ( !strcmp("POS",key) ) { fmt->type = T_POS; } - else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; } - else if ( !strcmp("END",key) ) { fmt->type = T_END; } - else if ( !strcmp("END0",key) ) { fmt->type = T_END0; } - else if ( !strcmp("ID",key) ) { fmt->type = T_ID; } - else if ( !strcmp("REF",key) ) { fmt->type = T_REF; } - else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } - else if ( !strcmp("FIRST_ALT",key) ) { fmt->type = T_FIRST_ALT; } - else if ( !strcmp("QUAL",key) 
) { fmt->type = T_QUAL; } - else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; } - else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } + _SET_NON_FORMAT_TAGS(set_type,key,fmt) + else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; } + else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } - else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } + else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) + { + fmt->type = T_INFO; + _used_tags_add(convert,T_INFO,key); + } } else if ( fmt->type==T_PBINOM ) { fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); + _used_tags_add(convert,T_FORMAT,key); } else if ( fmt->type==T_NPASS ) { @@ -1328,12 +1355,12 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) kputsn(p, q-p, &str); if ( is_gtf ) { - if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf); - else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf); - else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf); + if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); + else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT); + else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT); else if ( !strcmp(str.s, "TBCSQ") ) { - fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf); + fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ); fmt->subscript = parse_subscript(&q); if ( fmt->subscript==-1 ) { @@ -1341,7 +1368,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) } else fmt->subscript++; } - else if ( !strcmp(str.s, "IUPACGT") ) 
register_tag(convert, T_IUPAC_GT, "GT", is_gtf); + else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, "GT", is_gtf, T_IUPAC_GT); else if ( !strcmp(str.s, "INFO") ) { if ( *q!='/' ) @@ -1357,8 +1384,9 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO); fmt->subscript = parse_subscript(&q); + _used_tags_add(convert,T_INFO,str.s); } else if ( !strcmp(str.s,"PBINOM") ) { @@ -1368,47 +1396,33 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) while ( *q && *q!=')' ) q++; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - register_tag(convert, T_PBINOM, str.s, is_gtf); + register_tag(convert, str.s, is_gtf, T_PBINOM); q++; } else if ( !strcmp(str.s,"N_PASS") ) error("N_PASS() must be placed outside the square brackets\n"); else { - fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_FORMAT); fmt->subscript = parse_subscript(&q); } } else { - if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf); - else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf); - else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf); - else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf); - else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf); - else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf); - else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf); + _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) else if ( !strcmp(str.s, "ALT") ) { - fmt_t *fmt = register_tag(convert, 
T_ALT, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); fmt->subscript = parse_subscript(&q); } - else if ( !strcmp(str.s, "FIRST_ALT") ) register_tag(convert, T_FIRST_ALT, str.s, is_gtf); - else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf); - else if ( !strcmp(str.s, "FILTER") ) register_tag(convert, T_FILTER, str.s, is_gtf); - else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf); - else if ( !strcmp(str.s, "IS_TS") ) register_tag(convert, T_IS_TS, str.s, is_gtf); - else if ( !strcmp(str.s, "TYPE") ) register_tag(convert, T_TYPE, str.s, is_gtf); - else if ( !strcmp(str.s, "MASK") ) register_tag(convert, T_MASK, str.s, is_gtf); - else if ( !strcmp(str.s, "LINE") ) register_tag(convert, T_LINE, str.s, is_gtf); - else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, T_CHROM_POS_ID, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, T_GT_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, T_PL_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); - else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); - else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); + else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, str.s, is_gtf, T_CHROM_POS_ID); + else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GT_TO_PROB3); + else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_PL_TO_PROB3); + else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GP_TO_PROB3); + else if ( !strcmp(str.s, "_GT_TO_HAP") ) 
register_tag(convert, str.s, is_gtf, T_GT_TO_HAP); + else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, str.s, is_gtf, T_GT_TO_HAP2); + else if ( !strcmp(str.s, "RSX") ) register_tag(convert, str.s, is_gtf, T_RSX); + else if ( !strcmp(str.s, "VKX") ) register_tag(convert, str.s, is_gtf, T_VKX); else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n"); else if ( !strcmp(str.s, "INFO") ) { @@ -1419,14 +1433,15 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p, &str); - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO); fmt->subscript = parse_subscript(&q); + _used_tags_add(convert,T_INFO,str.s); } else - register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO + register_tag(convert, NULL, is_gtf, T_INFO); // the whole INFO } else if ( !strcmp(str.s, "FORMAT") ) - register_tag(convert, T_FORMAT, NULL, 0); + register_tag(convert, NULL, 0, T_FORMAT); else if ( !strcmp(str.s,"N_PASS") ) { if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); @@ -1441,12 +1456,13 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) } if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str); kputsn(p, q-p-1, &str); - register_tag(convert, T_NPASS, str.s, is_gtf); + register_tag(convert, str.s, is_gtf, T_NPASS); } else { - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); + fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO); fmt->subscript = parse_subscript(&q); + _used_tags_add(convert,T_INFO,str.s); } } free(str.s); @@ -1470,7 +1486,7 @@ static char *parse_sep(convert_t *convert, char *p, int is_gtf) q++; } if ( !str.l ) error("Could not parse format string: %s\n", convert->format_str); - 
register_tag(convert, T_SEP, str.s, is_gtf); + register_tag(convert, str.s, is_gtf, T_SEP); free(str.s); return q; } @@ -1481,6 +1497,7 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * convert->header = hdr; convert->format_str = strdup(format_str); convert->max_unpack = BCF_UN_STR; + convert->used_tags_hash = khash_str2int_init(); int i, is_gtf = 0; char *p = convert->format_str; @@ -1490,7 +1507,7 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * switch (*p) { case '[': is_gtf = 1; p++; break; - case ']': is_gtf = 0; register_tag(convert, T_SEP, NULL, 0); p++; break; + case ']': is_gtf = 0; register_tag(convert, NULL, 0, T_SEP); p++; break; case '%': p = parse_tag(convert, p, is_gtf); break; default: p = parse_sep(convert, p, is_gtf); break; } @@ -1521,6 +1538,12 @@ void convert_destroy(convert_t *convert) if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr); free(convert->fmt[i].key); } + if ( convert->nused_tags ) + { + for (i=0; inused_tags; i++) free(convert->used_tags_list[i]); + free(convert->used_tags_list); + } + khash_str2int_destroy(convert->used_tags_hash); free(convert->fmt); free(convert->undef_info_tag); free(convert->dat); @@ -1677,3 +1700,13 @@ int convert_max_unpack(convert_t *convert) return convert->max_unpack; } +int convert_is_tag_used(convert_t *convert, char *tag) +{ + return khash_str2int_has_key(convert->used_tags_hash, tag); +} +const char **convert_list_used_tags(convert_t *convert, int *ntags) +{ + *ntags = convert->nused_tags; + return (const char **)convert->used_tags_list; +} + diff --git a/bcftools/convert.h b/bcftools/convert.h index 11e892d..5bbbc2c 100644 --- a/bcftools/convert.h +++ b/bcftools/convert.h @@ -1,6 +1,6 @@ /* convert.h -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -40,6 +40,8 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...); int convert_header(convert_t *convert, kstring_t *str); int convert_line(convert_t *convert, bcf1_t *rec, kstring_t *str); int convert_max_unpack(convert_t *convert); +int convert_is_tag_used(convert_t *convert, char *tag); +const char **convert_list_used_tags(convert_t *convert, int *ntags); #endif diff --git a/bcftools/csq.c b/bcftools/csq.c index 8e3ee3b..6217987 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -371,7 +371,7 @@ gf_utr_t; vcsq_t information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T" - vcrec_t + vrec_t single VCF record and csq tied to this record. (Haplotype can have multiple consequences in several VCF records. Each record can have multiple consequences from multiple haplotypes.) @@ -390,6 +390,7 @@ struct _vcsq_t uint32_t strand:1, type:31; // one of CSQ_* types uint32_t trid; + uint32_t vcf_ial; uint32_t biotype; // one of GF_* types char *gene; // gene name bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234" @@ -398,7 +399,7 @@ struct _vcsq_t typedef struct { bcf1_t *line; - uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved + uint32_t *fmt_bm; // bitmask of sample consequences with first/second haplotype interleaved uint32_t nfmt:4, // the bitmask size (the number of integers per sample) nvcsq:28, mvcsq; vcsq_t *vcsq; // there can be multiple consequences for a single VCF record @@ -450,9 +451,10 @@ struct _hap_node_t hap_node_t **child, *prev; // children haplotypes and previous coding node int nchild, mchild; bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record + int vcf_ial; // which VCF allele generated this node uint32_t nend; // number of haplotypes ending in this node int *cur_child, mcur_child; // mapping from the allele to the currently active child - csq_t *csq_list; // list of haplotype's 
consequences, broken by position + csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record) int ncsq_list, mcsq_list; }; struct _tscript_t @@ -588,7 +590,7 @@ typedef struct _args_t char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; char *bcsq_tag; - int argc, output_type; + int argc, output_type, clevel; int phase, verbosity, local_csq, record_cmd_line; int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; @@ -689,7 +691,7 @@ static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end) new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not else { - new_chr = malloc(len+3); // gff does not have the prefix, faidx has + new_chr = malloc(len+4); // gff does not have the prefix, faidx has memcpy(new_chr,"chr",3); memcpy(new_chr+3,chr_beg,len); new_chr[len+3] = 0; @@ -1445,7 +1447,9 @@ void init_data(args_t *args) } else { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? 
args->output_fname : "standard output", strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); @@ -1501,7 +1505,7 @@ void destroy_data(args_t *args) { if ( !vbuf->vrec[j] ) continue; if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line); - free(vbuf->vrec[j]->smpl); + free(vbuf->vrec[j]->fmt_bm); free(vbuf->vrec[j]->vcsq); free(vbuf->vrec[j]); } @@ -1534,7 +1538,7 @@ typedef struct { tscript_t *tr; struct { - int32_t pos, rlen, alen; + int32_t pos, rlen, alen, ial; char *ref, *alt; bcf1_t *rec; } vcf; @@ -1668,7 +1672,7 @@ fprintf(stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s); #endif } void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); -static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type) +static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial) { while ( regitr_overlap(itr) ) { @@ -1682,13 +1686,14 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = ial; csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); return csq.type.type; } return 0; } -static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type) +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial) { #if XDBG fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); @@ -1701,6 +1706,7 @@ fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = ial; csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } @@ -1732,7 +1738,7 @@ fprintf(stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { - ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); if ( ret!=0 ) { regitr_destroy(itr); @@ -1759,7 +1765,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon @@ -1770,7 +1776,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { - ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); if ( ret!=0 ) { regitr_destroy(itr); @@ -1797,7 +1803,7 @@ fprintf(stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } // overlaps the exon or inside the exon @@ -1833,7 +1839,7 @@ fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut splice->vcf.rlen -= splice->tbeg + splice->tend - 1; if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; } } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; } @@ -1948,7 +1954,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2004,7 +2010,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2036,7 +2042,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% } if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end ) { - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 @@ -2067,7 +2073,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% return SPLICE_OVERLAP; } } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; } @@ -2093,7 +2099,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2123,7 +2129,7 @@ fprintf(stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2145,7 +2151,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut } if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end ) { - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } @@ -2167,7 +2173,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; } static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) @@ -2209,10 +2215,12 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, kstring_t str = {0,0,0}; tscript_t *tr = cds->tr; child->icds = cds->icds; // index of cds in the tscript's list of exons + child->vcf_ial = ial; splice_t splice; splice_init(&splice, rec); splice.tr = tr; + splice.vcf.ial = ial; splice.vcf.alt = rec->d.allele[ial]; splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1; if ( !(tr->trim & TRIM_5PRIME) ) @@ -2627,6 +2635,8 @@ fprintf(stderr,"csq_push: %d .. 
%d\n",rec->pos+1,csq->type.type); if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue; if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; if ( csq->type.gene != vrec->vcsq[i].gene ) continue; + if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue; + if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) { // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function @@ -2672,8 +2682,7 @@ fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); } // no such csq yet in this vcf record csq->vrec = vrec; - csq->idx = i; - vrec->nvcsq++; + csq->idx = vrec->nvcsq++; hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq); vrec->vcsq[i] = csq->type; return 0; @@ -2760,6 +2769,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq_t *csq = &node->csq_list[icsq]; csq->pos = hap->stack[ref_node].node->rec->pos; csq->type.trid = tr->id; + csq->type.vcf_ial = node->vcf_ial; csq->type.gene = tr->gene->name; csq->type.strand = tr->strand; csq->type.biotype = tr->type; @@ -2823,6 +2833,8 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_INFRAME_DELETION; else csq->type.type |= CSQ_INFRAME_INSERTION; + if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' ) + csq->type.type |= CSQ_STOP_GAINED; } else { @@ -2838,6 +2850,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_MISSENSE_VARIANT; } } + // Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored + if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' ) + { + rm_csq |= 
CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING; + csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED; + } if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP; csq->type.type &= ~rm_csq; @@ -2889,6 +2907,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; tmp_csq->pos = hap->stack[i].node->rec->pos; tmp_csq->type.trid = tr->id; + //??tmp_csq->type.vcf_ial = node->vcf_ial; .. this should not be needed for non-compound variants tmp_csq->type.gene = tr->gene->name; tmp_csq->type.strand = tr->strand; tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq; @@ -2904,6 +2923,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; tmp_csq->pos = hap->stack[i].node->rec->pos; tmp_csq->type.trid = tr->id; + //??tmp_csq->type.vcf_ial = node->vcf_ial; .. this should not be needed for non-compound variants tmp_csq->type.gene = tr->gene->name; tmp_csq->type.strand = tr->strand; tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq; @@ -2991,8 +3011,12 @@ void hap_finalize(args_t *args, hap_t *hap) if ( ibeg==-1 ) ibeg = i; continue; } + // the last base of the current variant vs the first base of the next + // variant: are they in the same codon? (forward strand) int icur = node2sbeg(i); int inext = node2sbeg(i+1); + if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen; + else if ( hap->stack[i].node->dlen < 0 ) icur++; if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet { if ( ibeg==-1 ) ibeg = i; @@ -3045,8 +3069,13 @@ void hap_finalize(args_t *args, hap_t *hap) if ( ibeg==-1 ) ibeg = i; continue; } + // the last base of the current variant vs the first base of the next + // variant: are they in the same codon? 
(reverse strand) int icur = sseq.m - 1 - node2sbeg(i); int inext = sseq.m - 1 - node2sbeg(i-1); + if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1; + else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen; + if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen; if ( icur/3 == inext/3 ) { if ( ibeg==-1 ) ibeg = i; @@ -3155,7 +3184,7 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha int ival, ibit; icsq2_to_bit(icsq2, &ival,&ibit); if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; - vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; + vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; } } @@ -3232,8 +3261,8 @@ vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) vrec_t *vrec = vbuf->vrec[vbuf->n - 1]; if ( args->phase!=PHASE_DROP_GT && args->smpl->n ) { - if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq); - else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq); + if ( !vrec->fmt_bm ) vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq); + else memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq); } if ( !vrec->line ) vrec->line = bcf_init1(); SWAP(bcf1_t*, (*rec_ptr), vrec->line); @@ -3293,8 +3322,8 @@ void vbuf_flush(args_t *args, uint32_t pos) { if ( vrec->nfmt < args->nfmt_bcsq ) for (j=1; jhdr_nsmpl; j++) - memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl)); - bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); + memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm)); + bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt); } vrec->nvcsq = 0; if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: 
cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); @@ -3409,11 +3438,12 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = i; csq.type.gene = tr->gene->name; int csq_type = node.csq; - // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though + // code repetition: it would be nice to reuse the code from hap_add_csq, needs refactoring though if ( node.type == HAP_SSS ) { csq.type.type = csq_type; @@ -3478,6 +3508,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_type |= CSQ_INFRAME_DELETION; else csq_type |= CSQ_INFRAME_INSERTION; + if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' ) + csq_type |= CSQ_STOP_GAINED; } else { @@ -3603,6 +3635,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = 1; csq.type.gene = tr->gene->name; csq.type.type = child->csq; csq_stage(args, &csq, rec); @@ -3715,6 +3748,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = ial; csq.type.gene = tr->gene->name; csq.type.type = child->csq; csq_stage(args, &csq, rec); @@ -3746,7 +3780,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) // known issues: tab output leads to unsorted output. This is because // coding haplotypes are printed in one go and buffering is not used // with tab output. VCF output is OK though. 
- if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists + if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return; // the consequence already exists int i,j,ngt = 0; if ( args->phase!=PHASE_DROP_GT ) @@ -3769,7 +3803,9 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; for (j=0; jtype.vcf_ial ) continue; csq_print_text(args, csq, args->smpl->idx[i],j+1); } } @@ -3782,7 +3818,9 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; for (j=0; jtype.vcf_ial ) continue; int icsq2 = 2*csq->idx + j; if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT @@ -3803,7 +3841,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) int ival, ibit; icsq2_to_bit(icsq2, &ival,&ibit); if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; - vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit; + vrec->fmt_bm[i*args->nfmt_bcsq + ival] |= 1 << ibit; } } } @@ -3835,6 +3873,7 @@ int test_utr(args_t *args, bcf1_t *rec) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = i; csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); ret = 1; @@ -4045,7 +4084,17 @@ static void process(args_t *args, bcf1_t **rec_ptr) bcf1_t *rec = *rec_ptr; static int32_t prev_rid = -1, prev_pos = -1; - if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; } + if ( prev_rid!=rec->rid ) + { + prev_rid = rec->rid; + prev_pos = rec->pos; + + // Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX). 
+ // Perform a simple sanity check (that does not catch much), the chromosome must be present in the + // reference file + if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) + error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + } if ( prev_pos > rec->pos ) error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); @@ -4105,36 +4154,38 @@ static const char *usage(void) "Usage: bcftools csq [OPTIONS] in.vcf\n" "\n" "Required options:\n" - " -f, --fasta-ref FILE reference file in fasta format\n" - " -g, --gff-annot FILE gff3 annotation file\n" + " -f, --fasta-ref FILE Reference file in fasta format\n" + " -g, --gff-annot FILE GFF3 annotation file\n" "\n" "CSQ options:\n" - " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n" - " -c, --custom-tag STRING use this tag instead of the default BCSQ\n" - " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n" - " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n" - " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" - " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" - " r: require phased GTs, throw an error on unphased het GTs\n" - " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" - " s: skip unphased hets\n" + " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" + " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n" + " -l, --local-csq Localized predictions, consider only one VCF record at a time\n" + " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n" + " -p, --phase a|m|r|R|s How to handle unphased 
heterozygous genotypes: [r]\n" + " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" + " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" + " r: require phased GTs, throw an error on unphased het GTs\n" + " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" + " s: skip unphased hets\n" "Options:\n" - " -e, --exclude EXPR exclude sites for which the expression is true\n" - " --force run even if some sanity checks fail\n" - " -i, --include EXPR select sites for which the expression is true\n" - " --no-version do not append version and command line to the header\n" - " -o, --output FILE write output to a file [standard output]\n" - " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" - " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" - " -r, --regions REGION restrict to comma-separated list of regions\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file FILE samples to include\n" - " -t, --targets REGION similar to -r but streams rather than index-jumps\n" - " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - " --threads INT use multithreading with worker threads [0]\n" - " -v, --verbose INT verbosity level 0-2 [1]\n" + " -e, --exclude EXPR Exclude sites for which the expression is true\n" + " --force Run even if some sanity checks fail\n" + " -i, --include EXPR Select sites for which the expression is true\n" + " --no-version Do not append version and command line to the header\n" + " -o, --output FILE Write output to a file [standard output]\n" + " -O, --output-type b|u|z|v|t[0-9] b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n" + " -r, --regions REGION Restrict to comma-separated 
list of regions\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n" + " -s, --samples -|LIST Samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file FILE Samples to include\n" + " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --threads INT Use multithreading with worker threads [0]\n" + " -v, --verbose INT Verbosity level 0-2 [1]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4154,6 +4205,7 @@ int main_csq(int argc, char *argv[]) args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values args->verbosity = 1; args->record_cmd_line = 1; + args->clevel = -1; static struct option loptions[] = { @@ -4176,14 +4228,18 @@ int main_csq(int argc, char *argv[]) {"verbose",1,0,'v'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,4}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,5}, {"no-version",no_argument,NULL,3}, {0,0,0,0} }; int c, targets_is_file = 0, regions_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) { @@ -4235,7 +4291,16 @@ int main_csq(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = 
strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); } break; case 'e': @@ -4250,6 +4315,18 @@ int main_csq(int argc, char *argv[]) case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 't': targets_list = optarg; break; case 'T': targets_list = optarg; targets_is_file = 1; break; + case 4 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 5 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; @@ -4266,10 +4343,18 @@ int main_csq(int argc, char *argv[]) if ( !args->fa_fname ) error("Missing the --fa-ref option\n"); if ( !args->gff_fname ) error("Missing the --gff option\n"); args->sr = bcf_sr_init(); - if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", targets_list); - if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", regions_list); + if ( targets_list ) + { + bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap); + if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", targets_list); + } + if ( regions_list ) + { + 
bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", regions_list); + } if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); if ( !bcf_sr_add_reader(args->sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index e7f6a70..db46c8b 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -373,7 +373,7 @@ gf_utr_t; vcsq_t information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T" - vcrec_t + vrec_t single VCF record and csq tied to this record. (Haplotype can have multiple consequences in several VCF records. Each record can have multiple consequences from multiple haplotypes.) @@ -392,6 +392,7 @@ struct _vcsq_t uint32_t strand:1, type:31; // one of CSQ_* types uint32_t trid; + uint32_t vcf_ial; uint32_t biotype; // one of GF_* types char *gene; // gene name bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234" @@ -400,7 +401,7 @@ struct _vcsq_t typedef struct { bcf1_t *line; - uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved + uint32_t *fmt_bm; // bitmask of sample consequences with first/second haplotype interleaved uint32_t nfmt:4, // the bitmask size (the number of integers per sample) nvcsq:28, mvcsq; vcsq_t *vcsq; // there can be multiple consequences for a single VCF record @@ -452,9 +453,10 @@ struct _hap_node_t hap_node_t **child, *prev; // children haplotypes and previous coding node int nchild, mchild; bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record + int vcf_ial; // which VCF allele generated this node uint32_t nend; // number of haplotypes ending in this node int *cur_child, mcur_child; // 
mapping from the allele to the currently active child - csq_t *csq_list; // list of haplotype's consequences, broken by position + csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record) int ncsq_list, mcsq_list; }; struct _tscript_t @@ -590,7 +592,7 @@ typedef struct _args_t char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; char *bcsq_tag; - int argc, output_type; + int argc, output_type, clevel; int phase, verbosity, local_csq, record_cmd_line; int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; @@ -691,7 +693,7 @@ static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end) new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not else { - new_chr = malloc(len+3); // gff does not have the prefix, faidx has + new_chr = malloc(len+4); // gff does not have the prefix, faidx has memcpy(new_chr,"chr",3); memcpy(new_chr+3,chr_beg,len); new_chr[len+3] = 0; @@ -1447,7 +1449,9 @@ void init_data(args_t *args) } else { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? 
args->output_fname : "standard output", strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); @@ -1503,7 +1507,7 @@ void destroy_data(args_t *args) { if ( !vbuf->vrec[j] ) continue; if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line); - free(vbuf->vrec[j]->smpl); + free(vbuf->vrec[j]->fmt_bm); free(vbuf->vrec[j]->vcsq); free(vbuf->vrec[j]); } @@ -1536,7 +1540,7 @@ typedef struct { tscript_t *tr; struct { - int32_t pos, rlen, alen; + int32_t pos, rlen, alen, ial; char *ref, *alt; bcf1_t *rec; } vcf; @@ -1670,7 +1674,7 @@ fprintf(bcftools_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s); #endif } void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); -static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type) +static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial) { while ( regitr_overlap(itr) ) { @@ -1684,13 +1688,14 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = ial; csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); return csq.type.type; } return 0; } -static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type) +static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial) { #if XDBG fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); @@ -1703,6 +1708,7 @@ fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = ial; csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); } @@ -1734,7 +1740,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { - ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); if ( ret!=0 ) { regitr_destroy(itr); @@ -1761,7 +1767,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon @@ -1772,7 +1778,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr { - ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); if ( ret!=0 ) { regitr_destroy(itr); @@ -1799,7 +1805,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } // overlaps the exon or inside the exon @@ -1835,7 +1841,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d splice->vcf.rlen -= splice->tbeg + splice->tend - 1; if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; } } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; } @@ -1950,7 +1956,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2006,7 +2012,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg, regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2038,7 +2044,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, } if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end ) { - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 @@ -2069,7 +2075,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, return SPLICE_OVERLAP; } } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; } @@ -2095,7 +2101,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2125,7 +2131,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d regitr_t *itr = regitr_init(NULL); const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr - csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); + csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); regitr_destroy(itr); } if ( !csq ) @@ -2147,7 +2153,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d } if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end ) { - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_OUTSIDE; } @@ -2169,7 +2175,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); } - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); + csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; } static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) @@ -2211,10 +2217,12 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, kstring_t str = {0,0,0}; tscript_t *tr = cds->tr; child->icds = cds->icds; // index of cds in the tscript's list of exons + child->vcf_ial = ial; splice_t splice; splice_init(&splice, rec); splice.tr = tr; + splice.vcf.ial = ial; splice.vcf.alt = rec->d.allele[ial]; splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1; if ( !(tr->trim & TRIM_5PRIME) ) @@ -2629,6 +2637,8 @@ fprintf(bcftools_stderr,"csq_push: %d .. 
%d\n",rec->pos+1,csq->type.type); if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue; if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; if ( csq->type.gene != vrec->vcsq[i].gene ) continue; + if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue; + if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) { // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function @@ -2674,8 +2684,7 @@ fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); } // no such csq yet in this vcf record csq->vrec = vrec; - csq->idx = i; - vrec->nvcsq++; + csq->idx = vrec->nvcsq++; hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq); vrec->vcsq[i] = csq->type; return 0; @@ -2762,6 +2771,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq_t *csq = &node->csq_list[icsq]; csq->pos = hap->stack[ref_node].node->rec->pos; csq->type.trid = tr->id; + csq->type.vcf_ial = node->vcf_ial; csq->type.gene = tr->gene->name; csq->type.strand = tr->strand; csq->type.biotype = tr->type; @@ -2825,6 +2835,8 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_INFRAME_DELETION; else csq->type.type |= CSQ_INFRAME_INSERTION; + if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' ) + csq->type.type |= CSQ_STOP_GAINED; } else { @@ -2840,6 +2852,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.type |= CSQ_MISSENSE_VARIANT; } } + // Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored + if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' ) + { + 
rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING; + csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED; + } if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP; csq->type.type &= ~rm_csq; @@ -2891,6 +2909,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; tmp_csq->pos = hap->stack[i].node->rec->pos; tmp_csq->type.trid = tr->id; + //??tmp_csq->type.vcf_ial = node->vcf_ial; .. this should not be needed for non-compound variants tmp_csq->type.gene = tr->gene->name; tmp_csq->type.strand = tr->strand; tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq; @@ -2906,6 +2925,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1]; tmp_csq->pos = hap->stack[i].node->rec->pos; tmp_csq->type.trid = tr->id; + //??tmp_csq->type.vcf_ial = node->vcf_ial; .. this should not be needed for non-compound variants tmp_csq->type.gene = tr->gene->name; tmp_csq->type.strand = tr->strand; tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq; @@ -2993,8 +3013,12 @@ void hap_finalize(args_t *args, hap_t *hap) if ( ibeg==-1 ) ibeg = i; continue; } + // the last base of the current variant vs the first base of the next + // variant: are they in the same codon? (forward strand) int icur = node2sbeg(i); int inext = node2sbeg(i+1); + if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen; + else if ( hap->stack[i].node->dlen < 0 ) icur++; if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet { if ( ibeg==-1 ) ibeg = i; @@ -3047,8 +3071,13 @@ void hap_finalize(args_t *args, hap_t *hap) if ( ibeg==-1 ) ibeg = i; continue; } + // the last base of the current variant vs the first base of the next + // variant: are they in the same codon? 
(reverse strand) int icur = sseq.m - 1 - node2sbeg(i); int inext = sseq.m - 1 - node2sbeg(i-1); + if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1; + else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen; + if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen; if ( icur/3 == inext/3 ) { if ( ibeg==-1 ) ibeg = i; @@ -3157,7 +3186,7 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha int ival, ibit; icsq2_to_bit(icsq2, &ival,&ibit); if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; - vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; + vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; } } @@ -3234,8 +3263,8 @@ vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) vrec_t *vrec = vbuf->vrec[vbuf->n - 1]; if ( args->phase!=PHASE_DROP_GT && args->smpl->n ) { - if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq); - else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq); + if ( !vrec->fmt_bm ) vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq); + else memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq); } if ( !vrec->line ) vrec->line = bcf_init1(); SWAP(bcf1_t*, (*rec_ptr), vrec->line); @@ -3295,8 +3324,8 @@ void vbuf_flush(args_t *args, uint32_t pos) { if ( vrec->nfmt < args->nfmt_bcsq ) for (j=1; jhdr_nsmpl; j++) - memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl)); - bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); + memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm)); + bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt); } vrec->nvcsq = 0; if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: 
cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); @@ -3411,11 +3440,12 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = i; csq.type.gene = tr->gene->name; int csq_type = node.csq; - // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though + // code repetition: it would be nice to reuse the code from hap_add_csq, needs refactoring though if ( node.type == HAP_SSS ) { csq.type.type = csq_type; @@ -3480,6 +3510,8 @@ int test_cds_local(args_t *args, bcf1_t *rec) csq_type |= CSQ_INFRAME_DELETION; else csq_type |= CSQ_INFRAME_INSERTION; + if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' ) + csq_type |= CSQ_STOP_GAINED; } else { @@ -3605,6 +3637,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = 1; csq.type.gene = tr->gene->name; csq.type.type = child->csq; csq_stage(args, &csq, rec); @@ -3717,6 +3750,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = ial; csq.type.gene = tr->gene->name; csq.type.type = child->csq; csq_stage(args, &csq, rec); @@ -3748,7 +3782,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) // known issues: tab output leads to unsorted output. This is because // coding haplotypes are printed in one go and buffering is not used // with tab output. VCF output is OK though. 
- if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists + if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return; // the consequence already exists int i,j,ngt = 0; if ( args->phase!=PHASE_DROP_GT ) @@ -3771,7 +3805,9 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; for (j=0; jtype.vcf_ial ) continue; csq_print_text(args, csq, args->smpl->idx[i],j+1); } } @@ -3784,7 +3820,9 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt; for (j=0; jtype.vcf_ial ) continue; int icsq2 = 2*csq->idx + j; if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT @@ -3805,7 +3843,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) int ival, ibit; icsq2_to_bit(icsq2, &ival,&ibit); if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; - vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit; + vrec->fmt_bm[i*args->nfmt_bcsq + ival] |= 1 << ibit; } } } @@ -3837,6 +3875,7 @@ int test_utr(args_t *args, bcf1_t *rec) csq.type.biotype = tr->type; csq.type.strand = tr->strand; csq.type.trid = tr->id; + csq.type.vcf_ial = i; csq.type.gene = tr->gene->name; csq_stage(args, &csq, rec); ret = 1; @@ -4047,7 +4086,17 @@ static void process(args_t *args, bcf1_t **rec_ptr) bcf1_t *rec = *rec_ptr; static int32_t prev_rid = -1, prev_pos = -1; - if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; } + if ( prev_rid!=rec->rid ) + { + prev_rid = rec->rid; + prev_pos = rec->pos; + + // Common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX). 
+ // Perform a simple sanity check (that does not catch much), the chromosome must be present in the + // reference file + if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) ) + error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname); + } if ( prev_pos > rec->pos ) error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); @@ -4107,36 +4156,38 @@ static const char *usage(void) "Usage: bcftools csq [OPTIONS] in.vcf\n" "\n" "Required options:\n" - " -f, --fasta-ref FILE reference file in fasta format\n" - " -g, --gff-annot FILE gff3 annotation file\n" + " -f, --fasta-ref FILE Reference file in fasta format\n" + " -g, --gff-annot FILE GFF3 annotation file\n" "\n" "CSQ options:\n" - " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n" - " -c, --custom-tag STRING use this tag instead of the default BCSQ\n" - " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n" - " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n" - " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" - " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" - " r: require phased GTs, throw an error on unphased het GTs\n" - " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" - " s: skip unphased hets\n" + " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" + " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n" + " -l, --local-csq Localized predictions, consider only one VCF record at a time\n" + " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n" + " -p, --phase a|m|r|R|s How to handle unphased 
heterozygous genotypes: [r]\n" + " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" + " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" + " r: require phased GTs, throw an error on unphased het GTs\n" + " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" + " s: skip unphased hets\n" "Options:\n" - " -e, --exclude EXPR exclude sites for which the expression is true\n" - " --force run even if some sanity checks fail\n" - " -i, --include EXPR select sites for which the expression is true\n" - " --no-version do not append version and command line to the header\n" - " -o, --output FILE write output to a file [standard output]\n" - " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" - " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" - " -r, --regions REGION restrict to comma-separated list of regions\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file FILE samples to include\n" - " -t, --targets REGION similar to -r but streams rather than index-jumps\n" - " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - " --threads INT use multithreading with worker threads [0]\n" - " -v, --verbose INT verbosity level 0-2 [1]\n" + " -e, --exclude EXPR Exclude sites for which the expression is true\n" + " --force Run even if some sanity checks fail\n" + " -i, --include EXPR Select sites for which the expression is true\n" + " --no-version Do not append version and command line to the header\n" + " -o, --output FILE Write output to a file [standard output]\n" + " -O, --output-type b|u|z|v|t[0-9] b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n" + " -r, --regions REGION Restrict to comma-separated 
list of regions\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n" + " -s, --samples -|LIST Samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file FILE Samples to include\n" + " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --threads INT Use multithreading with worker threads [0]\n" + " -v, --verbose INT Verbosity level 0-2 [1]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4156,6 +4207,7 @@ int main_csq(int argc, char *argv[]) args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values args->verbosity = 1; args->record_cmd_line = 1; + args->clevel = -1; static struct option loptions[] = { @@ -4178,14 +4230,18 @@ int main_csq(int argc, char *argv[]) {"verbose",1,0,'v'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,4}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,5}, {"no-version",no_argument,NULL,3}, {0,0,0,0} }; int c, targets_is_file = 0, regions_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) { @@ -4237,7 +4293,16 @@ int main_csq(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = 
strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); } break; case 'e': @@ -4252,6 +4317,18 @@ int main_csq(int argc, char *argv[]) case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 't': targets_list = optarg; break; case 'T': targets_list = optarg; targets_is_file = 1; break; + case 4 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 5 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; @@ -4268,10 +4345,18 @@ int main_csq(int argc, char *argv[]) if ( !args->fa_fname ) error("Missing the --fa-ref option\n"); if ( !args->gff_fname ) error("Missing the --gff option\n"); args->sr = bcf_sr_init(); - if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", targets_list); - if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", regions_list); + if ( targets_list ) + { + bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap); + if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", targets_list); + } + if ( regions_list ) + { + 
bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", regions_list); + } if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); if ( !bcf_sr_add_reader(args->sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); diff --git a/bcftools/mcall.c b/bcftools/mcall.c index e96d41d..5761896 100644 --- a/bcftools/mcall.c +++ b/bcftools/mcall.c @@ -1291,16 +1291,16 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->als[nals] = call->tgt_als->allele[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); - if ( j+1==*unseen ) - { - fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]); - int k; - for (k=0; kn_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]); - fprintf(stderr,"\tTAB="); - for (k=0; ktgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); - fprintf(stderr,"\n"); - return -1; - } + // if ( j+1==*unseen ) + // { + // fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. 
VCF=",i,call->tgt_als->allele[i],j,*unseen); + // int k; + // for (k=0; kn_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]); + // fprintf(stderr,"\tTAB="); + // for (k=0; ktgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); + // fprintf(stderr,"\n"); + // return -1; + // } if ( j>=0 ) { diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c index c2d38a6..ea57344 100644 --- a/bcftools/mcall.c.pysam.c +++ b/bcftools/mcall.c.pysam.c @@ -1293,16 +1293,16 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->als[nals] = call->tgt_als->allele[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); - if ( j+1==*unseen ) - { - fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]); - int k; - for (k=0; kn_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]); - fprintf(bcftools_stderr,"\tTAB="); - for (k=0; ktgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); - fprintf(bcftools_stderr,"\n"); - return -1; - } + // if ( j+1==*unseen ) + // { + // fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. 
VCF=",i,call->tgt_als->allele[i],j,*unseen); + // int k; + // for (k=0; kn_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]); + // fprintf(bcftools_stderr,"\tTAB="); + // for (k=0; ktgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); + // fprintf(bcftools_stderr,"\n"); + // return -1; + // } if ( j>=0 ) { diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index 1f40eff..eb0cc64 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -70,11 +70,11 @@ typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, max_indel_depth, max_read_len, fmt_flag, ambig_reads; int rflag_require, rflag_filter, output_type; - int openQ, extQ, tandemQ, min_support; // for indels + int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; - int reg_is_file, record_cmd_line, n_threads; + int reg_is_file, record_cmd_line, n_threads, clevel; faidx_t *fai; regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions regitr_t *bed_itr, *reg_itr; @@ -315,15 +315,16 @@ static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) } if (ma->conf->flag & MPLP_REALN) { - int i, tot_ins = 0; + int i; + // int tot_ins = 0; + // int p = 0; uint32_t *cigar = bam_get_cigar(b); - int p = 0; for (i=0; icore.n_cigar; i++) { int cig = cigar[i] & BAM_CIGAR_MASK; - if (bam_cigar_type(cig) & 2) - p += cigar[i] >> BAM_CIGAR_SHIFT; + // if (bam_cigar_type(cig) & 2) + // p += cigar[i] >> BAM_CIGAR_SHIFT; if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) { - tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; + // tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; // Possible further optimsation, check tot_ins==1 later // (and remove break) so we can detect single bp indels. // We may want to focus BAQ on more complex regions only. 
@@ -718,7 +719,9 @@ static int mpileup(mplp_conf_t *conf) fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header - conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname)); + char wmode[8]; + set_wmode(wmode,conf->output_type,conf->output_fname,conf->clevel); + conf->bcf_fp = hts_open(conf->output_fname ? conf->output_fname : "-", wmode); if (conf->bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); @@ -843,6 +846,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; + conf->bca->indel_win_size = conf->indel_win_size; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -1095,74 +1099,76 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" "\n" "Input options:\n" - " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" - " -A, --count-orphans do not discard anomalous read pairs\n" - " -b, --bam-list FILE list of input BAM filenames, one per line\n" - " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" - " -C, --adjust-MQ INT adjust mapping quality [0]\n" + " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n" + " -A, --count-orphans Do not discard anomalous read pairs\n" + " -b, --bam-list FILE List of input BAM filenames, one per line\n" + " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" + " -C, --adjust-MQ INT Adjust mapping quality [0]\n" " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" - " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + " -d, --max-depth INT Max raw per-file depth; avoids 
excessive memory usage [%d]\n", mplp->max_depth); fprintf(fp, - " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" - " -f, --fasta-ref FILE faidx indexed reference sequence file\n" - " --no-reference do not require fasta reference file\n" - " -G, --read-groups FILE select or exclude read groups listed in the file\n" - " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" + " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" + " --no-reference Do not require fasta reference file\n" + " -G, --read-groups FILE Select or exclude read groups listed in the file\n" + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); fprintf(fp, - " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); fprintf(fp, - " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); fprintf(fp, " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); fprintf(fp, - " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " --ignore-RG ignore RG tags (one BAM = one sample)\n" - " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " -r, --regions REG[,...] 
Comma separated list of regions in which pileup is generated\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " --ignore-RG Ignore RG tags (one BAM = one sample)\n" + " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require); fprintf(fp, - " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" + " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, - " -s, --samples LIST comma separated list of samples to include\n" - " -S, --samples-file FILE file of samples to include\n" - " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n" - " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - " -x, --ignore-overlaps disable read-pair overlap detection\n" - " --seed INT random number seed used for sampling deep regions [0]\n" + " -s, --samples LIST Comma separated list of samples to include\n" + " -S, --samples-file FILE File of samples to include\n" + " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " -x, --ignore-overlaps Disable read-pair overlap detection\n" + " --seed INT Random number seed used for sampling deep regions [0]\n" "\n" "Output options:\n" - " -a, --annotate LIST optional tags to output; '?' to list available tags []\n" - " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n" - " to minimum per-sample DP\n" - " --no-version do not append version and command line to the header\n" - " -o, --output FILE write output to FILE [standard output]\n" + " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n" + " -g, --gvcf INT[,...] 
Group non-variant sites into gVCF blocks according\n" + " To minimum per-sample DP\n" + " --no-version Do not append version and command line to the header\n" + " -o, --output FILE Write output to FILE [standard output]\n" " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" - " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" - " -U, --mwu-u use older probability scale for Mann-Whitney U test\n" - " --threads INT use multithreading with INT worker threads [0]\n" + " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" + " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n" + " --threads INT Use multithreading with INT worker threads [0]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); fprintf(fp, - " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac); fprintf(fp, - " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ); fprintf(fp, - " -I, --skip-indels do not perform indel calling\n" - " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + " -I, --skip-indels Do not perform indel calling\n" + " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); fprintf(fp, - " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, - " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm 
[%d]\n", mplp->max_read_len); fprintf(fp, " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); fprintf(fp, - " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" - " -P, --platforms STR comma separated list of platforms for indels [all]\n" + " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" + " -P, --platforms STR Comma separated list of platforms for indels [all]\n" " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); fprintf(fp, " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + fprintf(fp, + " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -1210,6 +1216,8 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; + mplp.indel_win_size = 110; + mplp.clevel = -1; hts_srand48(0); static const struct option lopts[] = @@ -1260,6 +1268,7 @@ int main_mpileup(int argc, char *argv[]) {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"indel-bias", required_argument, NULL, 10}, + {"indel-size", required_argument, NULL, 15}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1339,7 +1348,18 @@ int main_mpileup(int argc, char *argv[]) case 'u': mplp.output_type = FT_BCF; break; case 'z': mplp.output_type = FT_VCF_GZ; break; case 'v': mplp.output_type = FT_VCF; break; - default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. 
Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); + default: + { + char *tmp; + mplp.clevel = strtol(optarg,&tmp,10); + if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + char *tmp; + mplp.clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); } break; case 'C': mplp.capQ_thres = atoi(optarg); break; @@ -1364,6 +1384,17 @@ int main_mpileup(int argc, char *argv[]) else mplp.indel_bias = 1/atof(optarg); break; + case 15: { + char *tmp; + mplp.indel_win_size = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); + if ( mplp.indel_win_size < 110 ) + { + mplp.indel_win_size = 110; + fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size); + } + } + break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index c66c752..7ba73f8 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -72,11 +72,11 @@ typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, max_indel_depth, max_read_len, fmt_flag, ambig_reads; int rflag_require, rflag_filter, output_type; - int openQ, extQ, tandemQ, min_support; // for indels + int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; char *reg_fname, *pl_list, *fai_fname, *output_fname; - int reg_is_file, record_cmd_line, n_threads; + int reg_is_file, record_cmd_line, n_threads, clevel; faidx_t *fai; regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions regitr_t *bed_itr, *reg_itr; @@ -317,15 +317,16 @@ static int pileup_constructor(void 
*data, const bam1_t *b, bam_pileup_cd *cd) } if (ma->conf->flag & MPLP_REALN) { - int i, tot_ins = 0; + int i; + // int tot_ins = 0; + // int p = 0; uint32_t *cigar = bam_get_cigar(b); - int p = 0; for (i=0; icore.n_cigar; i++) { int cig = cigar[i] & BAM_CIGAR_MASK; - if (bam_cigar_type(cig) & 2) - p += cigar[i] >> BAM_CIGAR_SHIFT; + // if (bam_cigar_type(cig) & 2) + // p += cigar[i] >> BAM_CIGAR_SHIFT; if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) { - tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; + // tot_ins += cigar[i] >> BAM_CIGAR_SHIFT; // Possible further optimsation, check tot_ins==1 later // (and remove break) so we can detect single bp indels. // We may want to focus BAQ on more complex regions only. @@ -720,7 +721,9 @@ static int mpileup(mplp_conf_t *conf) fprintf(bcftools_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header - conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname)); + char wmode[8]; + set_wmode(wmode,conf->output_type,conf->output_fname,conf->clevel); + conf->bcf_fp = hts_open(conf->output_fname ? conf->output_fname : "-", wmode); if (conf->bcf_fp == NULL) { fprintf(bcftools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); bcftools_exit(EXIT_FAILURE); @@ -845,6 +848,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; + conf->bca->indel_win_size = conf->indel_win_size; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -1097,74 +1101,76 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n" "\n" "Input options:\n" - " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n" - " -A, --count-orphans do not discard anomalous read pairs\n" - " -b, --bam-list FILE list of input BAM filenames, one per line\n" - " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" - " -C, --adjust-MQ INT adjust mapping quality [0]\n" + " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n" + " -A, --count-orphans Do not discard anomalous read pairs\n" + " -b, --bam-list FILE List of input BAM filenames, one per line\n" + " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" + " -C, --adjust-MQ INT Adjust mapping quality [0]\n" " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" - " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); + " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); fprintf(fp, - " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" - " -f, --fasta-ref FILE faidx indexed reference sequence file\n" - " --no-reference do not require fasta reference file\n" - " -G, --read-groups FILE select or exclude read groups listed in the file\n" - " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); + " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" + " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" + " --no-reference 
Do not require fasta reference file\n" + " -G, --read-groups FILE Select or exclude read groups listed in the file\n" + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq); fprintf(fp, - " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ); fprintf(fp, - " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ); fprintf(fp, " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ); fprintf(fp, - " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " --ignore-RG ignore RG tags (one BAM = one sample)\n" - " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " --ignore-RG Ignore RG tags (one BAM = one sample)\n" + " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require); fprintf(fp, - " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" + " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, - " -s, --samples LIST comma separated list of samples to include\n" - " -S, --samples-file FILE file of samples to include\n" - " -t, --targets REG[,...] 
similar to -r but streams rather than index-jumps\n" - " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - " -x, --ignore-overlaps disable read-pair overlap detection\n" - " --seed INT random number seed used for sampling deep regions [0]\n" + " -s, --samples LIST Comma separated list of samples to include\n" + " -S, --samples-file FILE File of samples to include\n" + " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " -x, --ignore-overlaps Disable read-pair overlap detection\n" + " --seed INT Random number seed used for sampling deep regions [0]\n" "\n" "Output options:\n" - " -a, --annotate LIST optional tags to output; '?' to list available tags []\n" - " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n" - " to minimum per-sample DP\n" - " --no-version do not append version and command line to the header\n" - " -o, --output FILE write output to FILE [standard output]\n" + " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n" + " -g, --gvcf INT[,...] 
Group non-variant sites into gVCF blocks according\n" + " To minimum per-sample DP\n" + " --no-version Do not append version and command line to the header\n" + " -o, --output FILE Write output to FILE [standard output]\n" " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" - " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" - " -U, --mwu-u use older probability scale for Mann-Whitney U test\n" - " --threads INT use multithreading with INT worker threads [0]\n" + " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" + " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n" + " --threads INT Use multithreading with INT worker threads [0]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); fprintf(fp, - " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac); + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac); fprintf(fp, - " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ); + " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ); fprintf(fp, - " -I, --skip-indels do not perform indel calling\n" - " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); + " -I, --skip-indels Do not perform indel calling\n" + " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth); fprintf(fp, - " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support); fprintf(fp, - " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len); + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm 
[%d]\n", mplp->max_read_len); fprintf(fp, " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ); fprintf(fp, - " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n" - " -P, --platforms STR comma separated list of platforms for indels [all]\n" + " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" + " -P, --platforms STR Comma separated list of platforms for indels [all]\n" " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); fprintf(fp, " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + fprintf(fp, + " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -1212,6 +1218,8 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; + mplp.indel_win_size = 110; + mplp.clevel = -1; hts_srand48(0); static const struct option lopts[] = @@ -1262,6 +1270,7 @@ int main_mpileup(int argc, char *argv[]) {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"indel-bias", required_argument, NULL, 10}, + {"indel-size", required_argument, NULL, 15}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1341,7 +1350,18 @@ int main_mpileup(int argc, char *argv[]) case 'u': mplp.output_type = FT_BCF; break; case 'z': mplp.output_type = FT_VCF_GZ; break; case 'v': mplp.output_type = FT_VCF; break; - default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. 
Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); + default: + { + char *tmp; + mplp.clevel = strtol(optarg,&tmp,10); + if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + char *tmp; + mplp.clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); } break; case 'C': mplp.capQ_thres = atoi(optarg); break; @@ -1366,6 +1386,17 @@ int main_mpileup(int argc, char *argv[]) else mplp.indel_bias = 1/atof(optarg); break; + case 15: { + char *tmp; + mplp.indel_win_size = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); + if ( mplp.indel_win_size < 110 ) + { + mplp.indel_win_size = 110; + fprintf(bcftools_stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size); + } + } + break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index 0976fe3..14ee5de 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -67,11 +67,12 @@ typedef struct } annot_line_t; -#define REPLACE_MISSING 0 // replace only missing values -#define REPLACE_ALL 1 // replace both missing and existing values -#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing -#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise -#define MATCH_VALUE 4 // do not set, just match the value -c ~ID +#define REPLACE_MISSING (1<<0) // -c +TAG .. replace only missing values +#define REPLACE_ALL (1<<1) // -c TAG .. replace both missing and existing values +#define REPLACE_NON_MISSING (1<<2) // -c -TAG .. replace only if tgt is not missing +#define SET_OR_APPEND (1<<3) // -c =TAG .. 
set new value if missing or non-existent, append otherwise +#define MATCH_VALUE (1<<4) // -c ~ID .. do not set, just match the value +#define CARRY_OVER_MISSING (1<<5) // -c .TAG .. carry over source missing values as well #define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest #define MM_APPEND 1 // append, possibly multiple times #define MM_UNIQUE 2 // append, only unique values @@ -114,7 +115,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr, *hdr_out, *tgts_hdr; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, clevel; bcf_sr_regions_t *tgts; regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns @@ -139,6 +140,7 @@ typedef struct _args_t annot_col_t *cols; // column indexes and setters int ncols; int match_id; // set iff `-c ~ID` given + int match_end; // set iff `-c ~INFO/END` is given char *set_ids_fmt; convert_t *set_ids; @@ -292,6 +294,15 @@ static void init_remove_annots(args_t *args) void *keep = khash_str2int_init(); kstring_t str = {0,0,0}; char *ss = args->remove_annots; + + int i, ntags, needs_info = 0; + if ( args->set_ids ) + { + const char **tags = convert_list_used_tags(args->set_ids,&ntags); + for (i=0; inrm++; @@ -352,7 +363,11 @@ static void init_remove_annots(args_t *args) fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); tag->key = strdup(str.s); - if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; + if ( type==BCF_HL_INFO ) + { + tag->handler = remove_info_tag; + if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt); + } else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; } else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) @@ -365,7 +380,11 @@ static void init_remove_annots(args_t *args) else { tag->key = strdup(str.s); - if ( type==BCF_HL_INFO ) 
tag->handler = remove_info_tag; + if ( type==BCF_HL_INFO ) + { + tag->handler = remove_info_tag; + if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt); + } else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key); } @@ -379,6 +398,7 @@ static void init_remove_annots(args_t *args) else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; else if ( !strcasecmp("INFO",str.s) ) { + if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt); tag->handler = remove_info; if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO); } @@ -502,12 +522,16 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat // note: so far this works only with one filter, not a list of filters annot_line_t *tab = (annot_line_t*) data; - if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." + if ( tab->cols[col->icol][0]=='.' 
&& !tab->cols[col->icol][1] ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0); + return 0; + } hts_expand(int,1,args->mtmpi,args->tmpi); args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]); if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]); - if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); - if ( col->replace!=REPLACE_MISSING ) + if ( col->replace & SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); + if ( !(col->replace & REPLACE_MISSING) ) { bcf_update_filter(args->hdr_out,line,NULL,0); return bcf_update_filter(args->hdr_out,line,args->tmpi,1); @@ -526,10 +550,14 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void bcf1_t *rec = (bcf1_t*) data; if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT); if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); - if ( !rec->d.n_flt ) return 0; // don't overwrite with a missing value - if ( col->replace==SET_OR_APPEND || col->replace==REPLACE_MISSING ) + if ( !rec->d.n_flt ) // don't overwrite with a missing value unless asked { - if ( col->replace==REPLACE_MISSING && line->d.n_flt ) return 0; // only update missing FILTER + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0); + return 0; + } + if ( col->replace & (SET_OR_APPEND|REPLACE_MISSING) ) + { + if ( (col->replace & REPLACE_MISSING) && line->d.n_flt ) return 0; // only update missing FILTER for (i=0; id.n_flt; i++) { const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]); @@ -546,10 +574,21 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, 
void bcf_update_filter(args->hdr_out,line,NULL,0); return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); } +static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." + char *tmp; + int pos = strtol(tab->cols[col->icol], &tmp, 10); + if ( tmp==tab->cols[col->icol] ) + error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]); + line->pos = pos - 1; + return 0; +} static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); - if ( col->replace==MATCH_VALUE ) return 0; + if ( col->replace & MATCH_VALUE ) return 0; // possible cases: // IN ANNOT OUT ACHIEVED_BY @@ -562,8 +601,8 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) // annot_line_t *tab = (annot_line_t*) data; if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." - if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]); - if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]); + if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]); + if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]); // running with +ID, only update missing ids if ( !line->d.id || (line->d.id[0]=='.' 
&& !line->d.id[1]) ) @@ -572,7 +611,7 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) } static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - if ( col->replace==MATCH_VALUE ) return 0; + if ( col->replace & MATCH_VALUE ) return 0; bcf1_t *rec = (bcf1_t*) data; @@ -588,8 +627,8 @@ static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "." id = rec->d.id; } - if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id); - if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id); + if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id); + if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,id); // running with +ID, only update missing ids if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) @@ -630,9 +669,12 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) annot_line_t *tab = (annot_line_t*) data; char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; // empty - - if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0; + if ( str[0]=='.' 
&& str[1]==0 ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual); + return 0; + } + if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0; line->qual = strtod(str, &str); if ( str == tab->cols[col->icol] ) @@ -642,8 +684,12 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - if ( bcf_float_is_missing(rec->qual) ) return 0; - if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0; + if ( bcf_float_is_missing(rec->qual) ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual); + return 0; + } + if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0; line->qual = rec->qual; return 0; } @@ -653,7 +699,11 @@ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void * annot_line_t *tab = (annot_line_t*) data; char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' 
&& str[1]==0 ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); + return 0; + } if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); @@ -690,7 +740,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing; continue; } - if ( ntmpi2==ndst && col->replace==REPLACE_MISSING + if ( ntmpi2==ndst && (col->replace & REPLACE_MISSING) && args->tmpi2[i]!=bcf_int32_missing && args->tmpi2[i]!=bcf_int32_vector_end ) continue; @@ -704,7 +754,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d // This is a bit hacky, only to reuse existing code with minimal changes: // -c =TAG will now behave as -l TAG:APPEND for integers - if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND; if ( !tab ) { @@ -716,10 +766,23 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } int i,ntmpi = 0; + if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused ) + { + ntmpi = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi, &args->mtmpi); + if ( ntmpi>0 && (args->tmpi[0]!=bcf_int32_missing || (col->replace & CARRY_OVER_MISSING)) ) + { + col->mm_dbl_nused = col->mm_dbl_ndat = ntmpi; + hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); + for (i=0; imm_dbl[i] = args->tmpi[i]; + col->mm_dbl_ndat = 1; + } + ntmpi = 0; + } if ( tab ) // has data, not flushing yet { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; + if ( str[0]=='.' 
&& str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; while ( *end ) { @@ -727,7 +790,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) args->tmpi[ntmpi-1] = bcf_int32_missing; else ntmpi--; @@ -794,12 +857,11 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -811,7 +873,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; @@ -842,7 +904,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]); continue; } - if ( ntmpf2==ndst && col->replace==REPLACE_MISSING + if ( ntmpf2==ndst && (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(args->tmpf2[i]) && !bcf_float_is_vector_end(args->tmpf2[i]) ) continue; @@ 
-856,7 +918,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * // This is a bit hacky, only to reuse existing code with minimal changes: // -c =TAG will now behave as -l TAG:APPEND for floats - if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND; if ( !tab ) { @@ -868,10 +930,26 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * } int i,ntmpf = 0; - if ( tab ) + if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused ) + { + ntmpf = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf, &args->mtmpf); + if ( ntmpf>0 && (!bcf_float_is_missing(args->tmpf[0]) || (col->replace & CARRY_OVER_MISSING)) ) + { + col->mm_dbl_nused = ntmpf; + hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); + for (i=0; itmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i]); + else + col->mm_dbl[i] = args->tmpf[i]; + col->mm_dbl_ndat = 1; + } + ntmpf = 0; + } + if ( tab ) // data row, not just flushing { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; while ( *end ) { @@ -879,7 +957,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * hts_expand(float,ntmpf,args->mtmpf,args->tmpf); if ( str[0]=='.' 
&& (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) bcf_float_set_missing(args->tmpf[ntmpf-1]); else ntmpf--; @@ -962,7 +1040,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; @@ -979,7 +1057,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; @@ -1026,7 +1104,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in if ( empty ) copy_string_field(".",0,1,&args->tmpks,i); continue; } - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { // Do not replace filled values. The field must be looked up again because // of realloc in copy_string_field @@ -1055,7 +1133,7 @@ void khash_str2int_clear_free(void *_hash) } static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) + if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) { int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; @@ -1063,7 +1141,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d // This is a bit hacky, only to reuse existing code with minimal changes: // -c =TAG will now behave as -l TAG:unique for strings - if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE; + if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE; annot_line_t *tab = (annot_line_t*) data; @@ -1072,7 +1150,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d { len = strlen(tab->cols[col->icol]); if ( !len ) return 0; - if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1; + if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; } if ( col->merge_method!=MM_FIRST ) @@ -1090,6 +1168,14 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); } + if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l ) + { + int m = col->mm_kstr.m; + int n = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &col->mm_kstr.s, &m); + col->mm_kstr.m = m; + if ( n>0 && ((col->replace & CARRY_OVER_MISSING) || col->mm_kstr.s[0]!='.' || col->mm_kstr.s[1]) ) col->mm_kstr.l = n; + } + if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); kputs(tab->cols[col->icol], &col->mm_kstr); return 1; @@ -1135,7 +1221,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; @@ -1210,7 +1296,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) // field not present in dst file { - if ( col->replace==REPLACE_NON_MISSING ) return 0; + if ( col->replace & REPLACE_NON_MISSING ) return 0; hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { @@ -1235,8 +1321,8 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( args->sample_map[i]==-1 ) continue; int32_t *src = args->tmpi + nsrc*args->sample_map[i]; int32_t *dst = args->tmpi2 + ndst*i; - if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue; - if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue; + if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(dst[0]) ) continue; + if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(dst[0]) ) continue; for (j=0; jtmpi3 + nsrc*i; int keep_ori = 0; if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; - else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1; + else if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; + else if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1; if ( keep_ori ) { for (j=0; j 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); if ( ndst<=0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { @@ -1331,9 +1417,9 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, // . y y TAG,+TAG,-TAG .. 
REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING // x . . -TAG .. REPLACE_NON_MISSING - if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } - else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } - else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } + else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } for (j=0; jtmpi3 + nvals*i; // expanded buffer int use_new_ann = 1; if ( args->sample_map[i]==-1 ) use_new_ann = 0; - else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; } - else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; } - else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace & REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace & REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace & REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; } if ( !use_new_ann ) { for (j=0; j 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); if ( ndst<=0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2); for (i=0; ihdr_out); i++) { @@ -1397,9 +1483,9 @@ static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, if ( args->sample_map[i]==-1 ) continue; float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( 
col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } - else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } - else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } for (j=0; jtmpf3 + nvals*i; // expanded buffer int use_new_ann = 1; if ( args->sample_map[i]==-1 ) use_new_ann = 0; - else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } - else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } - else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; } + else if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; } if ( !use_new_ann ) { for (j=0; jsample_map[i]; char **dst = args->tmpp2 + i; - if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } - else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } - else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } + else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' 
&& (*src)[1]==0 ) continue; } *dst = *src; } return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl); @@ -1618,7 +1704,7 @@ static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, v int ndst1 = ndst / nsmpl_dst; if ( ndst <= 0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present if ( col->number==BCF_VL_G ) ndst1 = line->n_allele*(line->n_allele+1)/2; else @@ -1725,7 +1811,7 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, int ndst1 = ndst / nsmpl_dst; if ( ndst <= 0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present if ( col->number==BCF_VL_G ) ndst1 = line->n_allele*(line->n_allele+1)/2; else @@ -2011,6 +2097,24 @@ static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str) } ksprintf(str,">\n"); } +static char *set_replace_mode(char *ss, int *replace) +{ + int mode = 0; + while (*ss) + { + if ( *ss=='+' ) mode |= REPLACE_MISSING; + else if ( *ss=='-' ) mode |= REPLACE_NON_MISSING; + else if ( *ss=='=' ) mode |= SET_OR_APPEND; + else if ( *ss=='.' ) mode |= CARRY_OVER_MISSING; + else break; + ss++; + } + if ( !mode ) mode = REPLACE_ALL; +// is exactly one bit set? 
+// if ( mode && !(mode && ((mode & mode-1) == 0)) ) + *replace = mode; + return ss; +} static void init_columns(args_t *args) { int need_sample_map = 0; @@ -2061,10 +2165,8 @@ static void init_columns(args_t *args) while ( *ss ) { if ( *se && *se!=',' ) { se++; continue; } - int replace = REPLACE_ALL; - if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; } - else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; } - else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; } + int replace; + ss = set_replace_mode(ss, &replace); icol++; str.l = 0; kputsn(ss, se-ss, &str); @@ -2101,9 +2203,9 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) ) { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); if ( str.s[0]=='~' ) replace = MATCH_VALUE; - if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n"); + if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2112,12 +2214,38 @@ static void init_columns(args_t *args) col->setter = args->tgts_is_vcf ? 
vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); - if ( replace==MATCH_VALUE ) args->match_id = icol; + if ( replace & MATCH_VALUE ) args->match_id = icol; + } + else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf ) + { + replace = MATCH_VALUE; + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = NULL; + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); + args->match_end = icol; + } + else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf ) + { + if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n"); + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = setter_pos; + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); + args->match_end = icol; } else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column { if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s); - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2136,7 +2264,7 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("FILTER",str.s) ) { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING 
) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2165,8 +2293,8 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("QUAL",str.s) ) { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); + if ( replace & SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2178,8 +2306,8 @@ static void init_columns(args_t *args) } else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace & SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n"); bcf_hdr_t *tgts_hdr = args->files->readers[1].header; int j; for (j=0; jnhrec; j++) @@ -2317,8 +2445,8 @@ static void init_columns(args_t *args) } else { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace & SET_OR_APPEND ) { if ( args->tgts_is_vcf ) 
error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n" @@ -2333,6 +2461,11 @@ static void init_columns(args_t *args) key_dst = str.s + 5; explicit_dst_info = 1; } + else if ( !strcasecmp("~INFO/END",str.s) ) + { + key_dst = str.s + 6; + explicit_dst_info = 1; + } else key_dst = str.s; char *key_src = strstr(key_dst,":="); @@ -2420,7 +2553,7 @@ static void init_columns(args_t *args) case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id)); } - if ( replace==SET_OR_APPEND ) // change to Number=. + if ( replace & SET_OR_APPEND ) // change to Number=. { bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL); if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst); @@ -2595,6 +2728,11 @@ static void init_data(args_t *args) args->hdr = args->files->readers[0].header; args->hdr_out = bcf_hdr_dup(args->hdr); + if ( args->set_ids_fmt ) + { + if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; } + args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); + } if ( args->remove_annots ) init_remove_annots(args); if ( args->header_fname ) init_header_lines(args); if ( args->targets_fname && args->tgts_is_vcf ) @@ -2638,12 +2776,6 @@ static void init_data(args_t *args) if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); - if ( args->set_ids_fmt ) - { - if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; } - args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); - } - if ( args->mark_sites ) { if ( !args->targets_fname ) error("The -a option not given\n"); @@ -2651,13 +2783,15 @@ static void init_data(args_t *args) args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not 
",args->mark_sites); } - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); if ( args->rename_annots ) rename_annots(args, args->rename_annots); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); @@ -2789,6 +2923,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) ) { + if ( args->nalines + 1 == 0xffff ) break; // likely a symbolic allele, don't let the buffer overflow args->nalines++; hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); annot_line_t *tmp = &args->alines[args->nalines-1]; @@ -2869,6 +3004,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) { if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( !args->cols[j].setter ) continue; if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); } @@ -2887,6 +3023,10 @@ static void annotate(args_t *args, bcf1_t *line) if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" 
(todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + kstring_t match_end = {0,0,0}; + if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 ) + kputw(args->tmpi[0],&match_end); + // Find matching lines for (i=0; inalines; i++) { @@ -2906,6 +3046,7 @@ static void annotate(args_t *args, bcf1_t *line) ialt++; } if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; + if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; has_overlap = 1; break; @@ -2917,6 +3058,9 @@ static void annotate(args_t *args, bcf1_t *line) has_overlap = 1; } } + + free(match_end.s); + // Sort lines if needed if ( args->has_append_mode ) { @@ -2945,6 +3089,7 @@ static void annotate(args_t *args, bcf1_t *line) { if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); if ( ret < 0 ) error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -2957,6 +3102,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) { if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -2977,6 +3123,7 @@ static void annotate(args_t *args, bcf1_t *line) { if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); if ( ret < 0 ) error("fixme: Could not set 
missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -2989,6 +3136,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) { if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -3001,8 +3149,11 @@ static void annotate(args_t *args, bcf1_t *line) { bcf1_t *aline = bcf_sr_get_line(args->files,1); for (j=0; jncols; j++) + { + if ( !args->cols[j].setter ) continue; if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } has_overlap = 1; } @@ -3038,30 +3189,34 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools annotate [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(stderr, " --collapse STR matching records by , see man page for details [some]\n"); - fprintf(stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); - fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n"); - fprintf(stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n"); - fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); - fprintf(stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions REGION restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n"); - fprintf(stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); - fprintf(stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n"); - fprintf(stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude 
with \"^\" prefix)\n"); - fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); - fprintf(stderr, " --threads INT number of extra output compression threads [0]\n"); + fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(stderr, " --collapse STR Matching records by , see man page for details [some]\n"); + fprintf(stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); + fprintf(stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " --force Continue despite parsing error (at your own risk!)\n"); + fprintf(stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n"); + fprintf(stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n"); + fprintf(stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); + fprintf(stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + 
fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " --rename-annots FILE Rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); + fprintf(stderr, " --rename-chrs FILE Rename sequences according to the mapping: old\\tnew\n"); + fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " -S, --samples-file [^]FILE File of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); + fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); + fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n"); fprintf(stderr, "\n"); exit(1); } @@ -3079,7 +3234,9 @@ int main_vcfannotate(int argc, char *argv[]) args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; args->set_ids_replace = 1; args->match_id = -1; + args->clevel = -1; int regions_is_file = 0, collapse = 0; + int regions_overlap = 1; static struct option loptions[] = { @@ -3096,6 +3253,7 @@ int main_vcfannotate(int argc, char *argv[]) {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"remove",required_argument,NULL,'x'}, {"columns-file",required_argument,NULL,'C'}, {"columns",required_argument,NULL,'c'}, @@ -3109,6 +3267,7 @@ int main_vcfannotate(int argc, char *argv[]) {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { @@ -3136,8 +3295,17 @@ int main_vcfannotate(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); @@ -3161,6 
+3329,12 @@ int main_vcfannotate(int argc, char *argv[]) else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; @@ -3180,6 +3354,7 @@ int main_vcfannotate(int argc, char *argv[]) if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index b7e707b..3c8469e 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -69,11 +69,12 @@ typedef struct } annot_line_t; -#define REPLACE_MISSING 0 // replace only missing values -#define REPLACE_ALL 1 // replace both missing and existing values -#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing -#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise -#define MATCH_VALUE 4 // do not set, just match the value -c ~ID +#define REPLACE_MISSING (1<<0) // -c +TAG .. replace only missing values +#define REPLACE_ALL (1<<1) // -c TAG .. replace both missing and existing values +#define REPLACE_NON_MISSING (1<<2) // -c -TAG .. replace only if tgt is not missing +#define SET_OR_APPEND (1<<3) // -c =TAG .. set new value if missing or non-existent, append otherwise +#define MATCH_VALUE (1<<4) // -c ~ID .. do not set, just match the value +#define CARRY_OVER_MISSING (1<<5) // -c .TAG .. 
carry over source missing values as well #define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest #define MM_APPEND 1 // append, possibly multiple times #define MM_UNIQUE 2 // append, only unique values @@ -116,7 +117,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr, *hdr_out, *tgts_hdr; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, clevel; bcf_sr_regions_t *tgts; regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns @@ -141,6 +142,7 @@ typedef struct _args_t annot_col_t *cols; // column indexes and setters int ncols; int match_id; // set iff `-c ~ID` given + int match_end; // set iff `-c ~INFO/END` is given char *set_ids_fmt; convert_t *set_ids; @@ -294,6 +296,15 @@ static void init_remove_annots(args_t *args) void *keep = khash_str2int_init(); kstring_t str = {0,0,0}; char *ss = args->remove_annots; + + int i, ntags, needs_info = 0; + if ( args->set_ids ) + { + const char **tags = convert_list_used_tags(args->set_ids,&ntags); + for (i=0; inrm++; @@ -354,7 +365,11 @@ static void init_remove_annots(args_t *args) fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); tag->key = strdup(str.s); - if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; + if ( type==BCF_HL_INFO ) + { + tag->handler = remove_info_tag; + if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt); + } else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; } else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) @@ -367,7 +382,11 @@ static void init_remove_annots(args_t *args) else { tag->key = strdup(str.s); - if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; + if ( type==BCF_HL_INFO ) + { + tag->handler = remove_info_tag; + if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot 
combine with `--set-id %s`\n",tag->key,args->set_ids_fmt); + } else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key); } @@ -381,6 +400,7 @@ static void init_remove_annots(args_t *args) else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; else if ( !strcasecmp("INFO",str.s) ) { + if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt); tag->handler = remove_info; if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO); } @@ -504,12 +524,16 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat // note: so far this works only with one filter, not a list of filters annot_line_t *tab = (annot_line_t*) data; - if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." + if ( tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0); + return 0; + } hts_expand(int,1,args->mtmpi,args->tmpi); args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]); if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]); - if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); - if ( col->replace!=REPLACE_MISSING ) + if ( col->replace & SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); + if ( !(col->replace & REPLACE_MISSING) ) { bcf_update_filter(args->hdr_out,line,NULL,0); return bcf_update_filter(args->hdr_out,line,args->tmpi,1); @@ -528,10 +552,14 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void bcf1_t *rec = (bcf1_t*) data; if ( 
!(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT); if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); - if ( !rec->d.n_flt ) return 0; // don't overwrite with a missing value - if ( col->replace==SET_OR_APPEND || col->replace==REPLACE_MISSING ) + if ( !rec->d.n_flt ) // don't overwrite with a missing value unless asked { - if ( col->replace==REPLACE_MISSING && line->d.n_flt ) return 0; // only update missing FILTER + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0); + return 0; + } + if ( col->replace & (SET_OR_APPEND|REPLACE_MISSING) ) + { + if ( (col->replace & REPLACE_MISSING) && line->d.n_flt ) return 0; // only update missing FILTER for (i=0; id.n_flt; i++) { const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]); @@ -548,10 +576,21 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void bcf_update_filter(args->hdr_out,line,NULL,0); return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); } +static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data) +{ + annot_line_t *tab = (annot_line_t*) data; + if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." + char *tmp; + int pos = strtol(tab->cols[col->icol], &tmp, 10); + if ( tmp==tab->cols[col->icol] ) + error("Could not parse ~POS at %s:%"PRId64" .. 
[%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,tab->cols[col->icol]); + line->pos = pos - 1; + return 0; +} static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); - if ( col->replace==MATCH_VALUE ) return 0; + if ( col->replace & MATCH_VALUE ) return 0; // possible cases: // IN ANNOT OUT ACHIEVED_BY @@ -564,8 +603,8 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) // annot_line_t *tab = (annot_line_t*) data; if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." - if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]); - if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]); + if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]); + if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]); // running with +ID, only update missing ids if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) @@ -574,7 +613,7 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) } static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - if ( col->replace==MATCH_VALUE ) return 0; + if ( col->replace & MATCH_VALUE ) return 0; bcf1_t *rec = (bcf1_t*) data; @@ -590,8 +629,8 @@ static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "." 
id = rec->d.id; } - if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id); - if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id); + if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id); + if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,id); // running with +ID, only update missing ids if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) ) @@ -632,9 +671,12 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) annot_line_t *tab = (annot_line_t*) data; char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; // empty - - if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0; + if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual); + return 0; + } + if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0; line->qual = strtod(str, &str); if ( str == tab->cols[col->icol] ) @@ -644,8 +686,12 @@ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { bcf1_t *rec = (bcf1_t*) data; - if ( bcf_float_is_missing(rec->qual) ) return 0; - if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0; + if ( bcf_float_is_missing(rec->qual) ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual); + return 0; + } + if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0; line->qual = rec->qual; return 0; } @@ -655,7 +701,11 @@ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void 
* annot_line_t *tab = (annot_line_t*) data; char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked + { + if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); + return 0; + } if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); @@ -692,7 +742,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing; continue; } - if ( ntmpi2==ndst && col->replace==REPLACE_MISSING + if ( ntmpi2==ndst && (col->replace & REPLACE_MISSING) && args->tmpi2[i]!=bcf_int32_missing && args->tmpi2[i]!=bcf_int32_vector_end ) continue; @@ -706,7 +756,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d // This is a bit hacky, only to reuse existing code with minimal changes: // -c =TAG will now behave as -l TAG:APPEND for integers - if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND; if ( !tab ) { @@ -718,10 +768,23 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } int i,ntmpi = 0; + if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused ) + { + ntmpi = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi, &args->mtmpi); + if ( ntmpi>0 && (args->tmpi[0]!=bcf_int32_missing || (col->replace & CARRY_OVER_MISSING)) ) + { + col->mm_dbl_nused = col->mm_dbl_ndat = ntmpi; + hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); + for (i=0; imm_dbl[i] = args->tmpi[i]; + col->mm_dbl_ndat = 1; + } + ntmpi = 0; + } if ( tab ) // has data, not flushing yet { char *str = 
tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; while ( *end ) { @@ -729,7 +792,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) args->tmpi[ntmpi-1] = bcf_int32_missing; else ntmpi--; @@ -796,12 +859,11 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -813,7 +875,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; @@ -844,7 +906,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]); continue; } - if ( ntmpf2==ndst && col->replace==REPLACE_MISSING + if ( ntmpf2==ndst 
&& (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(args->tmpf2[i]) && !bcf_float_is_vector_end(args->tmpf2[i]) ) continue; @@ -858,7 +920,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * // This is a bit hacky, only to reuse existing code with minimal changes: // -c =TAG will now behave as -l TAG:APPEND for floats - if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND; + if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND; if ( !tab ) { @@ -870,10 +932,26 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * } int i,ntmpf = 0; - if ( tab ) + if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused ) + { + ntmpf = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf, &args->mtmpf); + if ( ntmpf>0 && (!bcf_float_is_missing(args->tmpf[0]) || (col->replace & CARRY_OVER_MISSING)) ) + { + col->mm_dbl_nused = ntmpf; + hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); + for (i=0; itmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i]); + else + col->mm_dbl[i] = args->tmpf[i]; + col->mm_dbl_ndat = 1; + } + ntmpf = 0; + } + if ( tab ) // data row, not just flushing { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; while ( *end ) { @@ -881,7 +959,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * hts_expand(float,ntmpf,args->mtmpf,args->tmpf); if ( str[0]=='.' 
&& (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) bcf_float_set_missing(args->tmpf[ntmpf-1]); else ntmpf--; @@ -964,7 +1042,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; @@ -981,7 +1059,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; @@ -1028,7 +1106,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in if ( empty ) copy_string_field(".",0,1,&args->tmpks,i); continue; } - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { // Do not replace filled values. The field must be looked up again because // of realloc in copy_string_field @@ -1057,7 +1135,7 @@ void khash_str2int_clear_free(void *_hash) } static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) + if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) { int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; @@ -1065,7 +1143,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d // This is a bit hacky, only to reuse existing code with minimal changes: // -c =TAG will now behave as -l TAG:unique for strings - if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE; + if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE; annot_line_t *tab = (annot_line_t*) data; @@ -1074,7 +1152,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d { len = strlen(tab->cols[col->icol]); if ( !len ) return 0; - if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1; + if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1; } if ( col->merge_method!=MM_FIRST ) @@ -1092,6 +1170,14 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); } + if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l ) + { + int m = col->mm_kstr.m; + int n = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &col->mm_kstr.s, &m); + col->mm_kstr.m = m; + if ( n>0 && ((col->replace & CARRY_OVER_MISSING) || col->mm_kstr.s[0]!='.' || col->mm_kstr.s[1]) ) col->mm_kstr.l = n; + } + if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); kputs(tab->cols[col->icol], &col->mm_kstr); return 1; @@ -1137,7 +1223,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); - if ( col->replace==REPLACE_MISSING ) + if ( col->replace & REPLACE_MISSING ) { int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; @@ -1212,7 +1298,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo nsrc /= bcf_hdr_nsamples(args->files->readers[1].header); if ( ndst<=0 ) // field not present in dst file { - if ( col->replace==REPLACE_NON_MISSING ) return 0; + if ( col->replace & REPLACE_NON_MISSING ) return 0; hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { @@ -1237,8 +1323,8 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( args->sample_map[i]==-1 ) continue; int32_t *src = args->tmpi + nsrc*args->sample_map[i]; int32_t *dst = args->tmpi2 + ndst*i; - if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue; - if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue; + if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(dst[0]) ) continue; + if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(dst[0]) ) continue; for (j=0; jtmpi3 + nsrc*i; int keep_ori = 0; if ( args->sample_map[i]==-1 ) keep_ori = 1; - else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; - else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1; + else if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(ori[0]) ) keep_ori = 1; + else if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1; if ( keep_ori ) { for (j=0; j 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); if ( ndst<=0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2); for (i=0; ihdr_out); i++) { @@ -1333,9 +1419,9 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, // . y y TAG,+TAG,-TAG .. 
REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING // x . . -TAG .. REPLACE_NON_MISSING - if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } - else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } - else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } + else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } for (j=0; jtmpi3 + nvals*i; // expanded buffer int use_new_ann = 1; if ( args->sample_map[i]==-1 ) use_new_ann = 0; - else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; } - else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; } - else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace & REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace & REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; } + else if ( col->replace & REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; } if ( !use_new_ann ) { for (j=0; j 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out); if ( ndst<=0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2); for (i=0; ihdr_out); i++) { @@ -1399,9 +1485,9 @@ static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, if ( args->sample_map[i]==-1 ) continue; float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( 
col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } - else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } - else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } + else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } for (j=0; jtmpf3 + nvals*i; // expanded buffer int use_new_ann = 1; if ( args->sample_map[i]==-1 ) use_new_ann = 0; - else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } - else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } - else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; } + else if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; } + else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; } if ( !use_new_ann ) { for (j=0; jsample_map[i]; char **dst = args->tmpp2 + i; - if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } - else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } - else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } + else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' 
&& (*src)[1]==0 ) continue; } *dst = *src; } return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl); @@ -1620,7 +1706,7 @@ static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, v int ndst1 = ndst / nsmpl_dst; if ( ndst <= 0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present if ( col->number==BCF_VL_G ) ndst1 = line->n_allele*(line->n_allele+1)/2; else @@ -1727,7 +1813,7 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, int ndst1 = ndst / nsmpl_dst; if ( ndst <= 0 ) { - if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present if ( col->number==BCF_VL_G ) ndst1 = line->n_allele*(line->n_allele+1)/2; else @@ -2013,6 +2099,24 @@ static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str) } ksprintf(str,">\n"); } +static char *set_replace_mode(char *ss, int *replace) +{ + int mode = 0; + while (*ss) + { + if ( *ss=='+' ) mode |= REPLACE_MISSING; + else if ( *ss=='-' ) mode |= REPLACE_NON_MISSING; + else if ( *ss=='=' ) mode |= SET_OR_APPEND; + else if ( *ss=='.' ) mode |= CARRY_OVER_MISSING; + else break; + ss++; + } + if ( !mode ) mode = REPLACE_ALL; +// is exactly one bit set? 
+// if ( mode && !(mode && ((mode & mode-1) == 0)) ) + *replace = mode; + return ss; +} static void init_columns(args_t *args) { int need_sample_map = 0; @@ -2063,10 +2167,8 @@ static void init_columns(args_t *args) while ( *ss ) { if ( *se && *se!=',' ) { se++; continue; } - int replace = REPLACE_ALL; - if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; } - else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; } - else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; } + int replace; + ss = set_replace_mode(ss, &replace); icol++; str.l = 0; kputsn(ss, se-ss, &str); @@ -2103,9 +2205,9 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) ) { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); if ( str.s[0]=='~' ) replace = MATCH_VALUE; - if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n"); + if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2114,12 +2216,38 @@ static void init_columns(args_t *args) col->setter = args->tgts_is_vcf ? 
vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); - if ( replace==MATCH_VALUE ) args->match_id = icol; + if ( replace & MATCH_VALUE ) args->match_id = icol; + } + else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf ) + { + replace = MATCH_VALUE; + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = NULL; + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); + args->match_end = icol; + } + else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf ) + { + if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n"); + args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); + annot_col_t *col = &args->cols[args->ncols-1]; + memset(col,0,sizeof(*col)); + col->icol = icol; + col->replace = replace; + col->setter = setter_pos; + col->hdr_key_src = strdup(str.s); + col->hdr_key_dst = strdup(str.s); + args->match_end = icol; } else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column { if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s); - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2138,7 +2266,7 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("FILTER",str.s) ) { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING 
) error("Apologies, the -FILTER feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2167,8 +2295,8 @@ static void init_columns(args_t *args) } else if ( !strcasecmp("QUAL",str.s) ) { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n"); + if ( replace & SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2180,8 +2308,8 @@ static void init_columns(args_t *args) } else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n"); + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace & SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n"); bcf_hdr_t *tgts_hdr = args->files->readers[1].header; int j; for (j=0; jnhrec; j++) @@ -2319,8 +2447,8 @@ static void init_columns(args_t *args) } else { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) + if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); + if ( replace & SET_OR_APPEND ) { if ( args->tgts_is_vcf ) 
error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n" @@ -2335,6 +2463,11 @@ static void init_columns(args_t *args) key_dst = str.s + 5; explicit_dst_info = 1; } + else if ( !strcasecmp("~INFO/END",str.s) ) + { + key_dst = str.s + 6; + explicit_dst_info = 1; + } else key_dst = str.s; char *key_src = strstr(key_dst,":="); @@ -2422,7 +2555,7 @@ static void init_columns(args_t *args) case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id)); } - if ( replace==SET_OR_APPEND ) // change to Number=. + if ( replace & SET_OR_APPEND ) // change to Number=. { bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL); if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst); @@ -2597,6 +2730,11 @@ static void init_data(args_t *args) args->hdr = args->files->readers[0].header; args->hdr_out = bcf_hdr_dup(args->hdr); + if ( args->set_ids_fmt ) + { + if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; } + args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); + } if ( args->remove_annots ) init_remove_annots(args); if ( args->header_fname ) init_header_lines(args); if ( args->targets_fname && args->tgts_is_vcf ) @@ -2640,12 +2778,6 @@ static void init_data(args_t *args) if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); - if ( args->set_ids_fmt ) - { - if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; } - args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); - } - if ( args->mark_sites ) { if ( !args->targets_fname ) error("The -a option not given\n"); @@ -2653,13 +2785,15 @@ static void init_data(args_t *args) args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not 
",args->mark_sites); } - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); + if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate"); if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); if ( args->rename_annots ) rename_annots(args, args->rename_annots); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); @@ -2791,6 +2925,7 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) ) { + if ( args->nalines + 1 == 0xffff ) break; // likely a symbolic allele, don't let the buffer overflow args->nalines++; hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); annot_line_t *tmp = &args->alines[args->nalines-1]; @@ -2871,6 +3006,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) { if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( !args->cols[j].setter ) continue; if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); } @@ -2889,6 +3025,10 @@ static void annotate(args_t *args, bcf1_t *line) if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" 
(todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + kstring_t match_end = {0,0,0}; + if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 ) + kputw(args->tmpi[0],&match_end); + // Find matching lines for (i=0; inalines; i++) { @@ -2908,6 +3048,7 @@ static void annotate(args_t *args, bcf1_t *line) ialt++; } if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; + if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; has_overlap = 1; break; @@ -2919,6 +3060,9 @@ static void annotate(args_t *args, bcf1_t *line) has_overlap = 1; } } + + free(match_end.s); + // Sort lines if needed if ( args->has_append_mode ) { @@ -2947,6 +3091,7 @@ static void annotate(args_t *args, bcf1_t *line) { if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); if ( ret < 0 ) error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -2959,6 +3104,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) { if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -2979,6 +3125,7 @@ static void annotate(args_t *args, bcf1_t *line) { if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; if ( args->cols[j].done==1 ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); if ( ret < 0 ) error("fixme: Could not set 
missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -2991,6 +3138,7 @@ static void annotate(args_t *args, bcf1_t *line) for (j=0; jncols; j++) { if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( !args->cols[j].setter ) continue; int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); @@ -3003,8 +3151,11 @@ static void annotate(args_t *args, bcf1_t *line) { bcf1_t *aline = bcf_sr_get_line(args->files,1); for (j=0; jncols; j++) + { + if ( !args->cols[j].setter ) continue; if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } has_overlap = 1; } @@ -3040,30 +3191,34 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools annotate [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(bcftools_stderr, " --collapse STR matching records by , see man page for details [some]\n"); - fprintf(bcftools_stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(bcftools_stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); - fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(bcftools_stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n"); - fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n"); - fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); - fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions REGION restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n"); - fprintf(bcftools_stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); - fprintf(bcftools_stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n"); - fprintf(bcftools_stderr, " -s, --samples [^]LIST comma 
separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(bcftools_stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); - fprintf(bcftools_stderr, " --threads INT number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(bcftools_stderr, " --collapse STR Matching records by , see man page for details [some]\n"); + fprintf(bcftools_stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); + fprintf(bcftools_stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " --force Continue despite parsing error (at your own risk!)\n"); + fprintf(bcftools_stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n"); + fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n"); + fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); + 
fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " --rename-annots FILE Rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); + fprintf(bcftools_stderr, " --rename-chrs FILE Rename sequences according to the mapping: old\\tnew\n"); + fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); + fprintf(bcftools_stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); + fprintf(bcftools_stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Examples:\n"); + fprintf(bcftools_stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -3081,7 +3236,9 @@ int main_vcfannotate(int argc, char *argv[]) args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; args->set_ids_replace = 1; args->match_id = -1; + args->clevel = -1; int regions_is_file = 0, collapse = 0; + int regions_overlap = 1; static struct option loptions[] = { @@ -3098,6 +3255,7 @@ int main_vcfannotate(int argc, char *argv[]) {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"remove",required_argument,NULL,'x'}, {"columns-file",required_argument,NULL,'C'}, {"columns",required_argument,NULL,'c'}, @@ -3111,6 +3269,7 @@ int main_vcfannotate(int argc, char *argv[]) {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { @@ -3138,8 +3297,17 @@ int main_vcfannotate(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be 
given, and they cannot be combined\n"); @@ -3163,6 +3331,12 @@ int main_vcfannotate(int argc, char *argv[]) else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; @@ -3182,6 +3356,7 @@ int main_vcfannotate(int argc, char *argv[]) if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index e2aab3f..ca2a899 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -74,13 +74,13 @@ rec_tgt_t; typedef struct { int flag; // combination of CF_* flags above - int output_type, n_threads, record_cmd_line; + int output_type, n_threads, record_cmd_line, clevel; htsFile *bcf_in, *out_fh; char *bcf_fname, *output_fname; char **samples; // for subsampling and ploidy int nsamples, *samples_map; // mapping from output sample names to original VCF char *regions, *targets; // regions to process - int regions_is_file, targets_is_file; + int regions_is_file, targets_is_file, regions_overlap; regidx_t *tgt_idx; regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp; vcfbuf_t *vcfbuf; @@ -624,6 +624,7 @@ static void init_data(args_t *args) if ( args->regions ) { + bcf_sr_set_opt(args->aux.srs,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); } @@ 
-692,7 +693,9 @@ static void init_data(args_t *args) if ( args->aux.flag & CALL_CONSTR_ALLELES ) args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -876,42 +879,41 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools call [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "File format options:\n"); - fprintf(stderr, " --no-version Do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); - fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n"); - fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); + fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n"); + fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Input/output options:\n"); - fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); -//todo? -// fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n"); -// fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n"); - fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); - fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); - fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); - fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); - fprintf(stderr, " -g, --gvcf INT,[...] 
Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); - fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); - fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); - fprintf(stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); + fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); + fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); + fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); + fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); + fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); + fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); + fprintf(stderr, " -v, --variants-only Output variant sites only\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); - fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); - fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n"); - fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(stderr, " -n, --novel-rate FLOAT,[...] 
Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args.clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'C': @@ -1056,6 +1070,12 @@ int main_vcfcall(int argc, char *argv[]) case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break; case 9 : args.n_threads = strtol(optarg, 0, 0); break; case 8 : args.record_cmd_line = 0; break; + case 4 : + if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; default: usage(&args); } } diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index b5bedb9..63e1b03 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -76,13 +76,13 @@ rec_tgt_t; typedef struct { int flag; // combination of CF_* flags above - int output_type, n_threads, record_cmd_line; + int output_type, n_threads, record_cmd_line, clevel; htsFile *bcf_in, *out_fh; char *bcf_fname, *output_fname; char **samples; // for subsampling and ploidy int nsamples, *samples_map; // mapping from output sample names to original VCF char *regions, *targets; // regions to process - int regions_is_file, targets_is_file; + int regions_is_file, targets_is_file, regions_overlap; regidx_t *tgt_idx; regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp; vcfbuf_t *vcfbuf; @@ -626,6 +626,7 @@ static void init_data(args_t *args) if ( args->regions ) { + bcf_sr_set_opt(args->aux.srs,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) 
error("Failed to read the regions: %s\n", args->regions); } @@ -694,7 +695,9 @@ static void init_data(args_t *args) if ( args->aux.flag & CALL_CONSTR_ALLELES ) args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -878,42 +881,41 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools call [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "File format options:\n"); - fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); - fprintf(bcftools_stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include [all samples]\n"); - fprintf(bcftools_stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); + fprintf(bcftools_stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include [all samples]\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Input/output options:\n"); - fprintf(bcftools_stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); - fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); -//todo? -// fprintf(bcftools_stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). 
Prefixed with ^ indicates a request for\n"); -// fprintf(bcftools_stderr, " tag removal [^I16,^QS,^FMT/QS]\n"); - fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); - fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); - fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); - fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); - fprintf(bcftools_stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); - fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); - fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); - fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(bcftools_stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); + fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); + fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); + fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); + fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); + fprintf(bcftools_stderr, " -g, --gvcf INT,[...] 
Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); + fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); + fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); + fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Consensus/variant calling options:\n"); - fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); - fprintf(bcftools_stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n"); - fprintf(bcftools_stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(bcftools_stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(bcftools_stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args.clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'C': @@ -1058,6 +1072,12 @@ int main_vcfcall(int argc, char *argv[]) case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break; case 9 : args.n_threads = strtol(optarg, 0, 0); break; case 8 : args.record_cmd_line = 0; break; + case 4 : + if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; default: usage(&args); } } diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index 02d610d..02f56b9 100644 --- 
a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2018 Genome Research Ltd. + Copyright (c) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -346,7 +346,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " chr = row[0]\n" " if chr[0]=='#': continue\n" " if chr not in dat: dat[chr] = []\n" - " dat[chr].append([row[1], float(row[2]), float(row[3])])\n" + " dat[chr].append([int(row[1]), float(row[2]), float(row[3])])\n" "\n" "cnv = {}\n" "with open('%s', 'r') as f:\n" @@ -356,6 +356,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " if chr[0]=='#': continue\n" " if chr not in cnv: cnv[chr] = []\n" " row[2] = int(row[2]) + 0.5\n" + " row[1] = int(row[1])\n" " cnv[chr].append(row[1:])\n" "\n" "for chr in dat:\n" @@ -372,7 +373,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " heat[1][x] = cn_dat[x][3]\n" " heat[2][x] = cn_dat[x][4]\n" " heat[3][x] = cn_dat[x][5]\n" - " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n" + " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r', shading='auto', alpha=0)\n" " mesh.set_clim(vmin=-1,vmax=1)\n" " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'.-',ms=3,color='black')\n" " fig.suptitle('%s (chr '+chr+')')\n" @@ -458,7 +459,7 @@ static void create_plots(args_t *args) " for row in reader:\n" " chr = row[0]\n" " if chr != plot_chr: continue\n" - " dat.append([row[1], float(row[2]), float(row[3])])\n" + " dat.append([int(row[1]), float(row[2]), float(row[3])])\n" "def read_cnv(file,cnv,plot_chr):\n" " with open(file, 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" @@ -466,6 +467,7 @@ static void create_plots(args_t *args) " chr = row[0]\n" " if chr != plot_chr: continue\n" " row[2] = int(row[2]) + 0.5\n" + " row[1] = int(row[1])\n" " cnv.append(row[1:])\n" "def find_diffs(a,b):\n" " out = []\n" @@ -504,7 +506,7 @@ static void create_plots(args_t *args) " heat[1][x] = cn_dat[x][3]\n" " heat[2][x] = cn_dat[x][4]\n" " heat[3][x] = 
cn_dat[x][5]\n" - " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr')\n" + " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr', shading='auto', alpha=0)\n" " mesh.set_clim(vmin=-1,vmax=1)\n" " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n" "\n" @@ -1212,28 +1214,30 @@ static void usage(args_t *args) fprintf(stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n"); fprintf(stderr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n"); fprintf(stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n"); - fprintf(stderr, "Usage: bcftools cnv [OPTIONS] \n"); + fprintf(stderr, "Usage: bcftools cnv [OPTIONS] FILE.vcf\n"); fprintf(stderr, "General Options:\n"); - fprintf(stderr, " -c, --control-sample optional control sample name to highlight differences\n"); - fprintf(stderr, " -f, --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(stderr, " -o, --output-dir \n"); - fprintf(stderr, " -p, --plot-threshold plot aberrant chromosomes with quality at least 'float'\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --query-sample query samply name\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " -c, --control-sample STRING Optional control sample name to highlight differences\n"); + fprintf(stderr, " -f, --AF-file FILE Read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(stderr, " -o, --output-dir PATH \n"); + fprintf(stderr, " -p, --plot-threshold FLOAT Plot aberrant chromosomes with quality at least FLOAT\n"); + fprintf(stderr, " -r, --regions REGION Restrict to 
comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -s, --query-sample STRING Query samply name\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, "HMM Options:\n"); - fprintf(stderr, " -a, --aberrant fraction of aberrant cells in query and control [1.0,1.0]\n"); - fprintf(stderr, " -b, --BAF-weight relative contribution from BAF [1]\n"); - fprintf(stderr, " -d, --BAF-dev expected BAF deviation in query and control [0.04,0.04]\n"); // experimental - fprintf(stderr, " -e, --err-prob uniform error probability [1e-4]\n"); - fprintf(stderr, " -k, --LRR-dev expected LRR deviation [0.2,0.2]\n"); // experimental - fprintf(stderr, " -l, --LRR-weight relative contribution from LRR [0.2]\n"); - fprintf(stderr, " -L, --LRR-smooth-win window of LRR moving average smoothing [10]\n"); - fprintf(stderr, " -O, --optimize estimate fraction of aberrant cells down to [1.0]\n"); - fprintf(stderr, " -P, --same-prob prior probability of -s/-c being the same [0.5]\n"); - fprintf(stderr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); + fprintf(stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n"); + fprintf(stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n"); + fprintf(stderr, " -d, --BAF-dev FLOAT[,FLOAT] Expected BAF deviation in query and control [0.04,0.04]\n"); // experimental + fprintf(stderr, " -e, --err-prob FLOAT Uniform error probability [1e-4]\n"); + fprintf(stderr, " -k, --LRR-dev FLOAT[,FLOAT] 
Expected LRR deviation [0.2,0.2]\n"); // experimental + fprintf(stderr, " -l, --LRR-weight FLOAT Relative contribution from LRR [0.2]\n"); + fprintf(stderr, " -L, --LRR-smooth-win INT Window of LRR moving average smoothing [10]\n"); + fprintf(stderr, " -O, --optimize FLOAT Estimate fraction of aberrant cells down to FLOAT [1.0]\n"); + fprintf(stderr, " -P, --same-prob FLOA> Prior probability of -s/-c being the same [0.5]\n"); + fprintf(stderr, " -x, --xy-prob FLOAT P(x|y) transition probability [1e-9]\n"); fprintf(stderr, "\n"); exit(1); } @@ -1265,6 +1269,9 @@ int main_vcfcnv(int argc, char *argv[]) args->query_sample.lrr_dev2 = args->control_sample.lrr_dev2 = 0.2*0.2; //0.20*0.20; // illumina: 0.18 int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; + static struct option loptions[] = { {"BAF-dev",1,0,'d'}, @@ -1283,8 +1290,10 @@ int main_vcfcnv(int argc, char *argv[]) {"control-sample",1,0,'c'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"plot-threshold",1,0,'p'}, {"output-dir",1,0,'o'}, {0,0,0,0} @@ -1371,6 +1380,18 @@ int main_vcfcnv(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(args); 
break; default: error("Unknown argument: %s\n", optarg); @@ -1388,11 +1409,13 @@ int main_vcfcnv(int argc, char *argv[]) if ( !args->output_dir ) error("Expected -o option\n"); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index d74486d..7562809 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2018 Genome Research Ltd. + Copyright (c) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -348,7 +348,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " chr = row[0]\n" " if chr[0]=='#': continue\n" " if chr not in dat: dat[chr] = []\n" - " dat[chr].append([row[1], float(row[2]), float(row[3])])\n" + " dat[chr].append([int(row[1]), float(row[2]), float(row[3])])\n" "\n" "cnv = {}\n" "with open('%s', 'r') as f:\n" @@ -358,6 +358,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " if chr[0]=='#': continue\n" " if chr not in cnv: cnv[chr] = []\n" " row[2] = int(row[2]) + 0.5\n" + " row[1] = int(row[1])\n" " cnv[chr].append(row[1:])\n" "\n" "for chr in dat:\n" @@ -374,7 +375,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " heat[1][x] = cn_dat[x][3]\n" " heat[2][x] = cn_dat[x][4]\n" " heat[3][x] = cn_dat[x][5]\n" - " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n" + " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r', shading='auto', alpha=0)\n" " mesh.set_clim(vmin=-1,vmax=1)\n" " ax3.plot([x[0] for x in cn_dat],[x[1] for x in 
cn_dat],'.-',ms=3,color='black')\n" " fig.suptitle('%s (chr '+chr+')')\n" @@ -460,7 +461,7 @@ static void create_plots(args_t *args) " for row in reader:\n" " chr = row[0]\n" " if chr != plot_chr: continue\n" - " dat.append([row[1], float(row[2]), float(row[3])])\n" + " dat.append([int(row[1]), float(row[2]), float(row[3])])\n" "def read_cnv(file,cnv,plot_chr):\n" " with open(file, 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" @@ -468,6 +469,7 @@ static void create_plots(args_t *args) " chr = row[0]\n" " if chr != plot_chr: continue\n" " row[2] = int(row[2]) + 0.5\n" + " row[1] = int(row[1])\n" " cnv.append(row[1:])\n" "def find_diffs(a,b):\n" " out = []\n" @@ -506,7 +508,7 @@ static void create_plots(args_t *args) " heat[1][x] = cn_dat[x][3]\n" " heat[2][x] = cn_dat[x][4]\n" " heat[3][x] = cn_dat[x][5]\n" - " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr')\n" + " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr', shading='auto', alpha=0)\n" " mesh.set_clim(vmin=-1,vmax=1)\n" " ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n" "\n" @@ -1214,28 +1216,30 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n"); fprintf(bcftools_stderr, " Ratio intensity (LRR). 
The HMM considers the following copy number states: CN 2\n"); fprintf(bcftools_stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n"); - fprintf(bcftools_stderr, "Usage: bcftools cnv [OPTIONS] \n"); + fprintf(bcftools_stderr, "Usage: bcftools cnv [OPTIONS] FILE.vcf\n"); fprintf(bcftools_stderr, "General Options:\n"); - fprintf(bcftools_stderr, " -c, --control-sample optional control sample name to highlight differences\n"); - fprintf(bcftools_stderr, " -f, --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(bcftools_stderr, " -o, --output-dir \n"); - fprintf(bcftools_stderr, " -p, --plot-threshold plot aberrant chromosomes with quality at least 'float'\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --query-sample query samply name\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -c, --control-sample STRING Optional control sample name to highlight differences\n"); + fprintf(bcftools_stderr, " -f, --AF-file FILE Read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(bcftools_stderr, " -o, --output-dir PATH \n"); + fprintf(bcftools_stderr, " -p, --plot-threshold FLOAT Plot aberrant chromosomes with quality at least FLOAT\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -s, --query-sample STRING Query samply name\n"); + 
fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, "HMM Options:\n"); - fprintf(bcftools_stderr, " -a, --aberrant fraction of aberrant cells in query and control [1.0,1.0]\n"); - fprintf(bcftools_stderr, " -b, --BAF-weight relative contribution from BAF [1]\n"); - fprintf(bcftools_stderr, " -d, --BAF-dev expected BAF deviation in query and control [0.04,0.04]\n"); // experimental - fprintf(bcftools_stderr, " -e, --err-prob uniform error probability [1e-4]\n"); - fprintf(bcftools_stderr, " -k, --LRR-dev expected LRR deviation [0.2,0.2]\n"); // experimental - fprintf(bcftools_stderr, " -l, --LRR-weight relative contribution from LRR [0.2]\n"); - fprintf(bcftools_stderr, " -L, --LRR-smooth-win window of LRR moving average smoothing [10]\n"); - fprintf(bcftools_stderr, " -O, --optimize estimate fraction of aberrant cells down to [1.0]\n"); - fprintf(bcftools_stderr, " -P, --same-prob prior probability of -s/-c being the same [0.5]\n"); - fprintf(bcftools_stderr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); + fprintf(bcftools_stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n"); + fprintf(bcftools_stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n"); + fprintf(bcftools_stderr, " -d, --BAF-dev FLOAT[,FLOAT] Expected BAF deviation in query and control [0.04,0.04]\n"); // experimental + fprintf(bcftools_stderr, " -e, --err-prob FLOAT Uniform error probability [1e-4]\n"); + fprintf(bcftools_stderr, " -k, --LRR-dev FLOAT[,FLOAT] Expected LRR deviation [0.2,0.2]\n"); // experimental + fprintf(bcftools_stderr, " -l, --LRR-weight FLOAT Relative contribution from LRR [0.2]\n"); + 
fprintf(bcftools_stderr, " -L, --LRR-smooth-win INT Window of LRR moving average smoothing [10]\n"); + fprintf(bcftools_stderr, " -O, --optimize FLOAT Estimate fraction of aberrant cells down to FLOAT [1.0]\n"); + fprintf(bcftools_stderr, " -P, --same-prob FLOA> Prior probability of -s/-c being the same [0.5]\n"); + fprintf(bcftools_stderr, " -x, --xy-prob FLOAT P(x|y) transition probability [1e-9]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -1267,6 +1271,9 @@ int main_vcfcnv(int argc, char *argv[]) args->query_sample.lrr_dev2 = args->control_sample.lrr_dev2 = 0.2*0.2; //0.20*0.20; // illumina: 0.18 int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; + static struct option loptions[] = { {"BAF-dev",1,0,'d'}, @@ -1285,8 +1292,10 @@ int main_vcfcnv(int argc, char *argv[]) {"control-sample",1,0,'c'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"plot-threshold",1,0,'p'}, {"output-dir",1,0,'o'}, {0,0,0,0} @@ -1373,6 +1382,18 @@ int main_vcfcnv(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -1390,11 +1411,13 @@ int 
main_vcfcnv(int argc, char *argv[]) if ( !args->output_dir ) error("Expected -o option\n"); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index 0781a60..50013a1 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -43,7 +43,7 @@ typedef struct _args_t { bcf_srs_t *files; htsFile *out_fh; - int output_type, n_threads, record_cmd_line; + int output_type, n_threads, record_cmd_line, clevel; bcf_hdr_t *out_hdr; int *seen_seq; @@ -51,13 +51,14 @@ typedef struct _args_t int *start_pos, start_tid, ifname; int *swap_phase, nswap, *nmatch, *nmism; bcf1_t **buf; + uint8_t *buf_mask; int nbuf, mbuf, prev_chr, min_PQ, prev_pos_check; int32_t *GTa, *GTb, mGTa, mGTb, *phase_qual, *phase_set; char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; - int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; + int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap; int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; - int verbose; + int verbose, explicit_output_type, ligate_force, ligate_warn; htsThreadPool *tpool; } args_t; @@ -116,7 +117,9 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + 
set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->allow_overlaps || args->phased_concat ) { @@ -144,6 +147,7 @@ static void init_data(args_t *args) { if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } @@ -213,6 +217,7 @@ static void destroy_data(args_t *args) free(args->swap_phase); for (i=0; imbuf; i++) bcf_destroy(args->buf[i]); free(args->buf); + free(args->buf_mask); free(args->GTa); free(args->GTb); free(args->nmatch); @@ -251,9 +256,10 @@ static void phased_flush(args_t *args) int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr); static int gt_absent_warned = 0; - for (i=0; inbuf; i+=2) { + if ( args->buf_mask[i/2]!=3 ) continue; + bcf1_t *arec = args->buf[i]; bcf1_t *brec = args->buf[i+1]; @@ -300,19 +306,23 @@ static void phased_flush(args_t *args) } for (i=0; inbuf/2; i+=2) { - bcf1_t *arec = args->buf[i]; - bcf_translate(args->out_hdr, args->files->readers[0].header, arec); - if ( args->nswap ) - phase_update(args, args->out_hdr, arec); + bcf1_t *rec; + bcf_hdr_t *hdr; + int mask = args->buf_mask[i/2]; + if ( mask & 1 ) { rec = args->buf[i]; hdr = args->files->readers[0].header; } + else { rec = args->buf[i+1]; hdr = args->files->readers[1].header; } + bcf_translate(args->out_hdr, hdr, rec); + if ( args->nswap && (mask&1) ) + phase_update(args, args->out_hdr, rec); if ( !args->compact_PS || args->phase_set_changed ) { - bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); + bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl); args->phase_set_changed = 0; } - if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) 
error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = arec->pos; + if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1); + args->prev_pos_check = rec->pos; } args->nswap = 0; for (j=0; jnbuf; i+=2) { - bcf1_t *brec = args->buf[i+1]; - bcf_translate(args->out_hdr, args->files->readers[1].header, brec); - if ( !PQ_printed ) + bcf1_t *rec; + bcf_hdr_t *hdr; + int mask = args->buf_mask[i/2]; + if ( mask & 2 ) { rec = args->buf[i+1]; hdr = args->files->readers[1].header; } + else { rec = args->buf[i]; hdr = args->files->readers[0].header; } + bcf_translate(args->out_hdr, hdr, rec); + if ( !PQ_printed && mask==3 ) { - bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl); + bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; jphase_qual[j] < args->min_PQ ) { - args->phase_set[j] = brec->pos+1; + args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; } else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing; } if ( args->nswap ) - phase_update(args, args->out_hdr, brec); + phase_update(args, args->out_hdr, rec); if ( !args->compact_PS || args->phase_set_changed ) { - bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); + bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl); args->phase_set_changed = 0; } - if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] 
Error: cannot write to %s\n", __func__,args->output_fname); - if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = brec->pos; + if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1); + args->prev_pos_check = rec->pos; } args->nbuf = 0; } -static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) +static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap) { + bcf_hdr_t *ahdr = arec ? bcf_sr_get_header(args->files,0) : NULL; + bcf_hdr_t *bhdr = brec ? bcf_sr_get_header(args->files,1) : NULL; + if ( arec && arec->errcode ) - error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1, args->files->readers[0].fname); + error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(ahdr,arec),(int64_t) arec->pos+1, args->files->readers[0].fname); if ( brec && brec->errcode ) - error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); + error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(bhdr,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); int i, nsmpl = bcf_hdr_nsamples(args->out_hdr); - int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec)); + int chr_id = arec ? bcf_hdr_name2id(args->out_hdr,bcf_seqname(ahdr,arec)) : bcf_hdr_name2id(args->out_hdr,bcf_seqname(bhdr,brec)); if ( args->prev_chr<0 || args->prev_chr!=chr_id ) { if ( args->prev_chr>=0 ) phased_flush(args); for (i=0; iphase_set[i] = arec->pos+1; + args->phase_set[i] = arec ? 
arec->pos+1 : brec->pos+1; args->phase_set_changed = 1; - if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec)); + if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec)); args->seen_seq[chr_id] = 1; args->prev_chr = chr_id; args->prev_pos_check = -1; } - if ( !brec ) + if ( !is_overlap ) { - bcf_translate(args->out_hdr, args->files->readers[0].header, arec); + assert(arec); + + bcf_translate(args->out_hdr, ahdr, arec); if ( args->nswap ) phase_update(args, args->out_hdr, arec); if ( !args->compact_PS || args->phase_set_changed ) @@ -403,7 +422,7 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( arec->pos < args->prev_pos_check ) - error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); + error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); args->prev_pos_check = arec->pos; return; } @@ -411,11 +430,21 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) int m = args->mbuf; args->nbuf += 2; hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf); + if ( m < args->mbuf ) args->buf_mask = (uint8_t*)realloc(args->buf_mask,sizeof(*args->buf_mask)*args->mbuf); for (i=m; imbuf; i++) args->buf[i] = bcf_init1(); - SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]); - SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]); + if ( arec ) SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]); + if ( brec ) SWAP(bcf1_t*, 
args->files->readers[1].buffer[0], args->buf[args->nbuf-1]); + args->buf_mask[args->nbuf/2-1] = (arec?1:0) | (brec?2:0); +} + +static int _get_active_index(bcf_srs_t *sr) +{ + int i; + for (i=0; inreaders; i++) + if ( bcf_sr_has_line(sr,i) ) return i; + return -1; } static void concat(args_t *args) @@ -451,37 +480,47 @@ static void concat(args_t *args) else if ( new_file ) bcf_sr_seek(args->files,NULL,0); // set to start - int nret; + int nret, ir; while ( (nret = bcf_sr_next_line(args->files)) ) { + int is_overlap = args->files->nreaders==1 ? 0 : 1; if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader { // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped - if ( ! bcf_sr_region_done(args->files,0) ) + if ( bcf_sr_region_done(args->files,0) ) + { + phased_flush(args); + bcf_sr_remove_reader(args->files, 0); + is_overlap = 0; + } + else if ( args->ligate_warn ) { if ( !site_drop_warned ) { + ir = _get_active_index(args->files); fprintf(stderr, "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" " This warning is printed only once.\n", - bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1 - ); + bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1); site_drop_warned = 1; } continue; } - phased_flush(args); - bcf_sr_remove_reader(args->files, 0); + else if ( !args->ligate_force ) + { + ir = _get_active_index(args->files); + error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n", + bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1); + } } // Get a line to learn about current position - for (i=0; ifiles->nreaders; i++) - if ( bcf_sr_has_line(args->files,i) ) break; - bcf1_t *line = bcf_sr_get_line(args->files,i); + ir = _get_active_index(args->files); + bcf1_t *line = bcf_sr_get_line(args->files,ir); // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to. 
- if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue; + if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue; seek_pos = seek_chr = -1; // Check if the position overlaps with the next, yet unopened, reader @@ -494,16 +533,37 @@ static void concat(args_t *args) } if ( must_seek ) { - bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos); + bcf_sr_seek(args->files, bcf_seqname(args->files->readers[ir].header,line), line->pos); seek_pos = line->pos; - seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)); + seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)); continue; } // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped - if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue; + if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) && !args->ligate_force ) + { + if ( args->ligate_warn && !site_drop_warned ) + { + ir = _get_active_index(args->files); + fprintf(stderr, + "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" + " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" + " This warning is printed only once.\n", + bcf_seqname(bcf_sr_get_header(args->files,ir),line), (int64_t) line->pos+1); + site_drop_warned = 1; + } + else if ( !args->ligate_warn ) + { + ir = _get_active_index(args->files); + error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n", + bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1); + } + continue; + } - phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL); + bcf1_t *line0 = bcf_sr_get_line(args->files,0); + bcf1_t *line1 = args->files->nreaders > 1 ? bcf_sr_get_line(args->files,1) : NULL; + phased_push(args, line0, line1, is_overlap); } if ( args->files->nreaders ) @@ -720,11 +780,6 @@ static void naive_concat_check_headers(args_t *args) // if BCF, check if tag IDs are consistent in the dictionary of strings if ( type.compression!=bgzf ) error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); - if ( type.format==vcf ) - { - bcf_hdr_destroy(hdr); - continue; - } _check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]); _check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]); @@ -742,6 +797,10 @@ static void naive_concat(args_t *args) // only compressed BCF atm BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; + htsFormat output_type; + output_type.format = (args->output_type & FT_VCF) ? vcf : bcf; + output_type.compression = (args->output_type & FT_GZ) ? 
bgzf : no_compression; + struct timeval t0, t1; const size_t page_size = BGZF_MAX_BLOCK_SIZE; uint8_t *buf = (uint8_t*) malloc(page_size); @@ -759,10 +818,17 @@ static void naive_concat(args_t *args) htsFormat type = *hts_get_format(hts_fp); if ( type.compression!=bgzf ) - error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); + error("\nThe --naive option works only for compressed BCFs or VCFs\n"); file_types |= type.format==vcf ? 1 : 2; if ( file_types==3 ) - error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); + error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs\n"); + if ( args->explicit_output_type ) + { + if ( output_type.format!=type.format ) + error("\nThe --naive option works only for the output of the same type, all BCFs or all VCFs\n"); + if ( output_type.compression!=type.compression ) + error("\nThe --naive option works only for the output of the same compression type\n"); + } BGZF *fp = hts_get_bgzfp(hts_fp); if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) @@ -848,20 +914,23 @@ static void usage(args_t *args) fprintf(stderr, "Options:\n"); fprintf(stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); fprintf(stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); - fprintf(stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); + fprintf(stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: \n"); fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n"); - fprintf(stderr, " -f, --file-list Read the list of files from a file.\n"); + fprintf(stderr, " -f, --file-list FILE Read the list of files from a file.\n"); fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); + 
fprintf(stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n"); + fprintf(stderr, " --ligate-warn Drop sites in imperfect overlaps\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n"); fprintf(stderr, " --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution.\n"); - fprintf(stderr, " -o, --output Write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); - fprintf(stderr, " -r, --regions Restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file Restrict to regions listed in a file\n"); - fprintf(stderr, " --threads Use multithreading with worker threads [0]\n"); - fprintf(stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " -q, --min-PQ INT Break phase set if phasing quality is lower than [30]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); fprintf(stderr, "\n"); exit(1); } @@ -877,6 +946,7 @@ int main_vcfconcat(int argc, char *argv[]) args->record_cmd_line = 1; 
args->min_PQ = 30; args->verbose = 1; + args->clevel = -1; static struct option loptions[] = { @@ -886,10 +956,13 @@ int main_vcfconcat(int argc, char *argv[]) {"compact-PS",no_argument,NULL,'c'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,12}, {"remove-duplicates",no_argument,NULL,'D'}, {"rm-dups",required_argument,NULL,'d'}, {"allow-overlaps",no_argument,NULL,'a'}, {"ligate",no_argument,NULL,'l'}, + {"ligate-force",no_argument,NULL,10}, + {"ligate-warn",no_argument,NULL,11}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, @@ -917,17 +990,35 @@ int main_vcfconcat(int argc, char *argv[]) case 'f': args->file_list = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': + args->explicit_output_type = 1; switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; + case 10 : args->ligate_force = 1; break; + case 11 : args->ligate_warn = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; + case 12 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + 
else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 'v': args->verbose = strtol(optarg, 0, 0); error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); @@ -944,6 +1035,7 @@ int main_vcfconcat(int argc, char *argv[]) args->fnames[args->nfnames-1] = strdup(argv[optind]); optind++; } + if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n"); if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n"); if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); if ( args->file_list ) diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index 0cd061e..e0b23ad 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -45,7 +45,7 @@ typedef struct _args_t { bcf_srs_t *files; htsFile *out_fh; - int output_type, n_threads, record_cmd_line; + int output_type, n_threads, record_cmd_line, clevel; bcf_hdr_t *out_hdr; int *seen_seq; @@ -53,13 +53,14 @@ typedef struct _args_t int *start_pos, start_tid, ifname; int *swap_phase, nswap, *nmatch, *nmism; bcf1_t **buf; + uint8_t *buf_mask; int nbuf, mbuf, prev_chr, min_PQ, prev_pos_check; int32_t *GTa, *GTb, mGTa, mGTb, *phase_qual, *phase_set; char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; - int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; + int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap; int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; - int verbose; + int verbose, explicit_output_type, ligate_force, ligate_warn; htsThreadPool *tpool; } args_t; @@ -118,7 +119,9 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - 
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->allow_overlaps || args->phased_concat ) { @@ -146,6 +149,7 @@ static void init_data(args_t *args) { if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } @@ -215,6 +219,7 @@ static void destroy_data(args_t *args) free(args->swap_phase); for (i=0; imbuf; i++) bcf_destroy(args->buf[i]); free(args->buf); + free(args->buf_mask); free(args->GTa); free(args->GTb); free(args->nmatch); @@ -253,9 +258,10 @@ static void phased_flush(args_t *args) int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr); static int gt_absent_warned = 0; - for (i=0; inbuf; i+=2) { + if ( args->buf_mask[i/2]!=3 ) continue; + bcf1_t *arec = args->buf[i]; bcf1_t *brec = args->buf[i+1]; @@ -302,19 +308,23 @@ static void phased_flush(args_t *args) } for (i=0; inbuf/2; i+=2) { - bcf1_t *arec = args->buf[i]; - bcf_translate(args->out_hdr, args->files->readers[0].header, arec); - if ( args->nswap ) - phase_update(args, args->out_hdr, arec); + bcf1_t *rec; + bcf_hdr_t *hdr; + int mask = args->buf_mask[i/2]; + if ( mask & 1 ) { rec = args->buf[i]; hdr = args->files->readers[0].header; } + else { rec = args->buf[i+1]; hdr = args->files->readers[1].header; } + bcf_translate(args->out_hdr, hdr, rec); + if ( args->nswap && (mask&1) ) + phase_update(args, args->out_hdr, rec); if ( !args->compact_PS || args->phase_set_changed ) { - bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); + 
bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl); args->phase_set_changed = 0; } - if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = arec->pos; + if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1); + args->prev_pos_check = rec->pos; } args->nswap = 0; for (j=0; jnbuf; i+=2) { - bcf1_t *brec = args->buf[i+1]; - bcf_translate(args->out_hdr, args->files->readers[1].header, brec); - if ( !PQ_printed ) + bcf1_t *rec; + bcf_hdr_t *hdr; + int mask = args->buf_mask[i/2]; + if ( mask & 2 ) { rec = args->buf[i+1]; hdr = args->files->readers[1].header; } + else { rec = args->buf[i]; hdr = args->files->readers[0].header; } + bcf_translate(args->out_hdr, hdr, rec); + if ( !PQ_printed && mask==3 ) { - bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl); + bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; jphase_qual[j] < args->min_PQ ) { - args->phase_set[j] = brec->pos+1; + args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; } else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing; } if ( args->nswap ) - phase_update(args, args->out_hdr, brec); + phase_update(args, args->out_hdr, rec); if ( !args->compact_PS || args->phase_set_changed ) { - bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); + bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl); args->phase_set_changed = 0; } - if ( bcf_write(args->out_fh, 
args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = brec->pos; + if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1); + args->prev_pos_check = rec->pos; } args->nbuf = 0; } -static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) +static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap) { + bcf_hdr_t *ahdr = arec ? bcf_sr_get_header(args->files,0) : NULL; + bcf_hdr_t *bhdr = brec ? bcf_sr_get_header(args->files,1) : NULL; + if ( arec && arec->errcode ) - error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1, args->files->readers[0].fname); + error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(ahdr,arec),(int64_t) arec->pos+1, args->files->readers[0].fname); if ( brec && brec->errcode ) - error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); + error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(bhdr,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); int i, nsmpl = bcf_hdr_nsamples(args->out_hdr); - int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec)); + int chr_id = arec ? 
bcf_hdr_name2id(args->out_hdr,bcf_seqname(ahdr,arec)) : bcf_hdr_name2id(args->out_hdr,bcf_seqname(bhdr,brec)); if ( args->prev_chr<0 || args->prev_chr!=chr_id ) { if ( args->prev_chr>=0 ) phased_flush(args); for (i=0; iphase_set[i] = arec->pos+1; + args->phase_set[i] = arec ? arec->pos+1 : brec->pos+1; args->phase_set_changed = 1; - if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec)); + if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec)); args->seen_seq[chr_id] = 1; args->prev_chr = chr_id; args->prev_pos_check = -1; } - if ( !brec ) + if ( !is_overlap ) { - bcf_translate(args->out_hdr, args->files->readers[0].header, arec); + assert(arec); + + bcf_translate(args->out_hdr, ahdr, arec); if ( args->nswap ) phase_update(args, args->out_hdr, arec); if ( !args->compact_PS || args->phase_set_changed ) @@ -405,7 +424,7 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); if ( arec->pos < args->prev_pos_check ) - error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); + error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); args->prev_pos_check = arec->pos; return; } @@ -413,11 +432,21 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) int m = args->mbuf; args->nbuf += 2; hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf); + if ( m < args->mbuf ) args->buf_mask = (uint8_t*)realloc(args->buf_mask,sizeof(*args->buf_mask)*args->mbuf); for (i=m; imbuf; i++) args->buf[i] = bcf_init1(); - SWAP(bcf1_t*, 
args->files->readers[0].buffer[0], args->buf[args->nbuf-2]); - SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]); + if ( arec ) SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]); + if ( brec ) SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]); + args->buf_mask[args->nbuf/2-1] = (arec?1:0) | (brec?2:0); +} + +static int _get_active_index(bcf_srs_t *sr) +{ + int i; + for (i=0; inreaders; i++) + if ( bcf_sr_has_line(sr,i) ) return i; + return -1; } static void concat(args_t *args) @@ -453,37 +482,47 @@ static void concat(args_t *args) else if ( new_file ) bcf_sr_seek(args->files,NULL,0); // set to start - int nret; + int nret, ir; while ( (nret = bcf_sr_next_line(args->files)) ) { + int is_overlap = args->files->nreaders==1 ? 0 : 1; if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader { // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped - if ( ! bcf_sr_region_done(args->files,0) ) + if ( bcf_sr_region_done(args->files,0) ) + { + phased_flush(args); + bcf_sr_remove_reader(args->files, 0); + is_overlap = 0; + } + else if ( args->ligate_warn ) { if ( !site_drop_warned ) { + ir = _get_active_index(args->files); fprintf(bcftools_stderr, "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" " This warning is printed only once.\n", - bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1 - ); + bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1); site_drop_warned = 1; } continue; } - phased_flush(args); - bcf_sr_remove_reader(args->files, 0); + else if ( !args->ligate_force ) + { + ir = _get_active_index(args->files); + error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n", + bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1); + } } // Get a line to learn about current position - for (i=0; ifiles->nreaders; i++) - if ( bcf_sr_has_line(args->files,i) ) break; - bcf1_t *line = bcf_sr_get_line(args->files,i); + ir = _get_active_index(args->files); + bcf1_t *line = bcf_sr_get_line(args->files,ir); // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to. 
- if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue; + if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue; seek_pos = seek_chr = -1; // Check if the position overlaps with the next, yet unopened, reader @@ -496,16 +535,37 @@ static void concat(args_t *args) } if ( must_seek ) { - bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos); + bcf_sr_seek(args->files, bcf_seqname(args->files->readers[ir].header,line), line->pos); seek_pos = line->pos; - seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)); + seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)); continue; } // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped - if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue; + if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) && !args->ligate_force ) + { + if ( args->ligate_warn && !site_drop_warned ) + { + ir = _get_active_index(args->files); + fprintf(bcftools_stderr, + "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" + " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" + " This warning is printed only once.\n", + bcf_seqname(bcf_sr_get_header(args->files,ir),line), (int64_t) line->pos+1); + site_drop_warned = 1; + } + else if ( !args->ligate_warn ) + { + ir = _get_active_index(args->files); + error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n", + bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1); + } + continue; + } - phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL); + bcf1_t *line0 = bcf_sr_get_line(args->files,0); + bcf1_t *line1 = args->files->nreaders > 1 ? bcf_sr_get_line(args->files,1) : NULL; + phased_push(args, line0, line1, is_overlap); } if ( args->files->nreaders ) @@ -722,11 +782,6 @@ static void naive_concat_check_headers(args_t *args) // if BCF, check if tag IDs are consistent in the dictionary of strings if ( type.compression!=bgzf ) error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); - if ( type.format==vcf ) - { - bcf_hdr_destroy(hdr); - continue; - } _check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]); _check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]); @@ -744,6 +799,10 @@ static void naive_concat(args_t *args) // only compressed BCF atm BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; + htsFormat output_type; + output_type.format = (args->output_type & FT_VCF) ? vcf : bcf; + output_type.compression = (args->output_type & FT_GZ) ? 
bgzf : no_compression; + struct timeval t0, t1; const size_t page_size = BGZF_MAX_BLOCK_SIZE; uint8_t *buf = (uint8_t*) malloc(page_size); @@ -761,10 +820,17 @@ static void naive_concat(args_t *args) htsFormat type = *hts_get_format(hts_fp); if ( type.compression!=bgzf ) - error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); + error("\nThe --naive option works only for compressed BCFs or VCFs\n"); file_types |= type.format==vcf ? 1 : 2; if ( file_types==3 ) - error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); + error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs\n"); + if ( args->explicit_output_type ) + { + if ( output_type.format!=type.format ) + error("\nThe --naive option works only for the output of the same type, all BCFs or all VCFs\n"); + if ( output_type.compression!=type.compression ) + error("\nThe --naive option works only for the output of the same compression type\n"); + } BGZF *fp = hts_get_bgzfp(hts_fp); if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) @@ -850,20 +916,23 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Options:\n"); fprintf(bcftools_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); fprintf(bcftools_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); - fprintf(bcftools_stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); + fprintf(bcftools_stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: \n"); fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d exact\n"); - fprintf(bcftools_stderr, " -f, --file-list Read the list of files from a file.\n"); + fprintf(bcftools_stderr, " -f, --file-list FILE Read the list of files from a file.\n"); fprintf(bcftools_stderr, " -l, 
--ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); + fprintf(bcftools_stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n"); + fprintf(bcftools_stderr, " --ligate-warn Drop sites in imperfect overlaps\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n"); fprintf(bcftools_stderr, " --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution.\n"); - fprintf(bcftools_stderr, " -o, --output Write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); - fprintf(bcftools_stderr, " -r, --regions Restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file Restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " --threads Use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " -q, --min-PQ INT Break phase set if phasing quality is lower than [30]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " --threads INT 
Use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -879,6 +948,7 @@ int main_vcfconcat(int argc, char *argv[]) args->record_cmd_line = 1; args->min_PQ = 30; args->verbose = 1; + args->clevel = -1; static struct option loptions[] = { @@ -888,10 +958,13 @@ int main_vcfconcat(int argc, char *argv[]) {"compact-PS",no_argument,NULL,'c'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,12}, {"remove-duplicates",no_argument,NULL,'D'}, {"rm-dups",required_argument,NULL,'d'}, {"allow-overlaps",no_argument,NULL,'a'}, {"ligate",no_argument,NULL,'l'}, + {"ligate-force",no_argument,NULL,10}, + {"ligate-warn",no_argument,NULL,11}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, @@ -919,17 +992,35 @@ int main_vcfconcat(int argc, char *argv[]) case 'f': args->file_list = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': + args->explicit_output_type = 1; switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; + case 10 : args->ligate_force = 1; break; + case 11 : args->ligate_warn = 1; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 7 : 
args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; + case 12 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 'v': args->verbose = strtol(optarg, 0, 0); error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); @@ -946,6 +1037,7 @@ int main_vcfconcat(int argc, char *argv[]) args->fnames[args->nfnames-1] = strdup(argv[optind]); optind++; } + if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n"); if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n"); if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); if ( args->file_list ) diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index a48e85c..c0fddac 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -66,9 +66,10 @@ struct _args_t float *flt; int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; + int regions_overlap, targets_overlap; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; - int argc, n_threads, record_cmd_line, keep_duplicates; + int argc, n_threads, record_cmd_line, keep_duplicates, clevel; }; static void destroy_data(args_t *args) @@ -88,11 +89,13 @@ static void open_vcf(args_t *args, const char *format_str) if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( 
args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } @@ -394,7 +397,9 @@ static void gensample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -522,7 +527,9 @@ static void haplegendsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -636,7 +643,9 @@ static void hapsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? 
args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1224,7 +1233,9 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_add_sample(args->header, NULL); args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1276,7 +1287,9 @@ static void tsv_to_vcf(args_t *args) static void vcf_to_vcf(args_t *args) { open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1305,7 +1318,9 @@ static void gvcf_to_vcf(args_t *args) if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname); open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? 
args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1380,58 +1395,60 @@ static void usage(void) fprintf(stderr, "\n"); fprintf(stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n"); fprintf(stderr, " formats details. When specifying output files explicitly instead\n"); - fprintf(stderr, " of with , one can use '-' for stdout and '.' to suppress.\n"); - fprintf(stderr, "Usage: bcftools convert [OPTIONS] \n"); + fprintf(stderr, " of with PREFIX, one can use '-' for stdout and '.' to suppress.\n"); + fprintf(stderr, "Usage: bcftools convert [OPTIONS] INPUT_FILE\n"); fprintf(stderr, "\n"); fprintf(stderr, "VCF input options:\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples list of samples to include\n"); - fprintf(stderr, " -S, --samples-file file of samples to include\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -s, --samples LIST List of 
samples to include\n"); + fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "VCF output options:\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output output file name [stdout]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); - fprintf(stderr, " -G, --gensample2vcf <...> |,\n"); - fprintf(stderr, " -g, --gensample <...> |,\n"); - fprintf(stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); - fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); - fprintf(stderr, " --keep-duplicates keep duplicate positions\n"); - fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); + fprintf(stderr, " -G, --gensample2vcf ... |,\n"); + fprintf(stderr, " -g, --gensample ... 
|,\n"); + fprintf(stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); + fprintf(stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(stderr, " --keep-duplicates Keep duplicate positions\n"); + fprintf(stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(stderr, " --vcf-ids Output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); fprintf(stderr, "gVCF conversion:\n"); - fprintf(stderr, " --gvcf2vcf expand gVCF reference blocks\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(stderr, " --gvcf2vcf Expand gVCF reference blocks\n"); + fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); fprintf(stderr, "\n"); fprintf(stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n"); - fprintf(stderr, " --hapsample2vcf <...> |,\n"); - fprintf(stderr, " --hapsample <...> |,\n"); - fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(stderr, " --hapsample2vcf ... |,\n"); + fprintf(stderr, " --hapsample ... 
|,\n"); + fprintf(stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n"); + fprintf(stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); fprintf(stderr, "HAP/LEGEND/SAMPLE conversion:\n"); - fprintf(stderr, " -H, --haplegendsample2vcf <...> |,,\n"); - fprintf(stderr, " -h, --haplegendsample <...> |,,\n"); - fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(stderr, " -H, --haplegendsample2vcf ... |,,\n"); + fprintf(stderr, " -h, --haplegendsample ... |,,\n"); + fprintf(stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n"); + fprintf(stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, "\n"); fprintf(stderr, "TSV conversion:\n"); - fprintf(stderr, " --tsv2vcf \n"); - fprintf(stderr, " -c, --columns columns of the input tsv file [ID,CHROM,POS,AA]\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(stderr, " -s, --samples list of sample names\n"); - fprintf(stderr, " -S, --samples-file file of sample names\n"); + fprintf(stderr, " --tsv2vcf FILE\n"); + fprintf(stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); + fprintf(stderr, " -s, --samples LIST List of sample names\n"); + fprintf(stderr, " -S, --samples-file FILE File of sample names\n"); fprintf(stderr, "\n"); // fprintf(stderr, "PLINK options:\n"); // fprintf(stderr, " -p, --plink |,,|,,|,\n"); @@ -1453,6 
+1470,9 @@ int main_vcfconvert(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->regions_overlap = 1; + args->targets_overlap = 0; + args->clevel = -1; static struct option loptions[] = { @@ -1463,8 +1483,10 @@ int main_vcfconvert(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,13}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,14}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, {"sex",required_argument,NULL,11}, @@ -1486,6 +1508,7 @@ int main_vcfconvert(int argc, char *argv[]) {"keep-duplicates",no_argument,NULL,12}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { switch (c) { case 'e': @@ -1520,7 +1543,16 @@ int main_vcfconvert(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break; @@ -1528,6 +1560,18 @@ int main_vcfconvert(int argc, char *argv[]) case 10 : args->record_cmd_line = 0; break; case 11 : args->sex_fname = optarg; break; case 12 : args->keep_duplicates = 1; break; + case 13 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + 
else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 14 : + if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index 358e404..8f8d4a3 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -68,9 +68,10 @@ struct _args_t float *flt; int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; + int regions_overlap, targets_overlap; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; - int argc, n_threads, record_cmd_line, keep_duplicates; + int argc, n_threads, record_cmd_line, keep_duplicates, clevel; }; static void destroy_data(args_t *args) @@ -90,11 +91,13 @@ static void open_vcf(args_t *args, const char *format_str) if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } @@ -396,7 +399,9 @@ static void gensample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + 
set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -524,7 +529,9 @@ static void haplegendsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -638,7 +645,9 @@ static void hapsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1226,7 +1235,9 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_add_sample(args->header, NULL); args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? 
args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1278,7 +1289,9 @@ static void tsv_to_vcf(args_t *args) static void vcf_to_vcf(args_t *args) { open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1307,7 +1320,9 @@ static void gvcf_to_vcf(args_t *args) if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname); open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->outfname,args->clevel); + htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1382,58 +1397,60 @@ static void usage(void) fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n"); fprintf(bcftools_stderr, " formats details. When specifying output files explicitly instead\n"); - fprintf(bcftools_stderr, " of with , one can use '-' for bcftools_stdout and '.' to suppress.\n"); - fprintf(bcftools_stderr, "Usage: bcftools convert [OPTIONS] \n"); + fprintf(bcftools_stderr, " of with PREFIX, one can use '-' for bcftools_stdout and '.' 
to suppress.\n"); + fprintf(bcftools_stderr, "Usage: bcftools convert [OPTIONS] INPUT_FILE\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "VCF input options:\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --samples list of samples to include\n"); - fprintf(bcftools_stderr, " -S, --samples-file file of samples to include\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, "\n"); 
fprintf(bcftools_stderr, "VCF output options:\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); - fprintf(bcftools_stderr, " -G, --gensample2vcf <...> |,\n"); - fprintf(bcftools_stderr, " -g, --gensample <...> |,\n"); - fprintf(bcftools_stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); - fprintf(bcftools_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); - fprintf(bcftools_stderr, " --keep-duplicates keep duplicate positions\n"); - fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(bcftools_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, " -G, --gensample2vcf ... |,\n"); + fprintf(bcftools_stderr, " -g, --gensample ... 
|,\n"); + fprintf(bcftools_stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); + fprintf(bcftools_stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, " --keep-duplicates Keep duplicate positions\n"); + fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "gVCF conversion:\n"); - fprintf(bcftools_stderr, " --gvcf2vcf expand gVCF reference blocks\n"); - fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(bcftools_stderr, " --gvcf2vcf Expand gVCF reference blocks\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n"); - fprintf(bcftools_stderr, " --hapsample2vcf <...> |,\n"); - fprintf(bcftools_stderr, " --hapsample <...> |,\n"); - fprintf(bcftools_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(bcftools_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, " --hapsample2vcf ... |,\n"); + fprintf(bcftools_stderr, " --hapsample ... 
|,\n"); + fprintf(bcftools_stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n"); + fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "HAP/LEGEND/SAMPLE conversion:\n"); - fprintf(bcftools_stderr, " -H, --haplegendsample2vcf <...> |,,\n"); - fprintf(bcftools_stderr, " -h, --haplegendsample <...> |,,\n"); - fprintf(bcftools_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(bcftools_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, " -H, --haplegendsample2vcf ... |,,\n"); + fprintf(bcftools_stderr, " -h, --haplegendsample ... |,,\n"); + fprintf(bcftools_stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n"); + fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "TSV conversion:\n"); - fprintf(bcftools_stderr, " --tsv2vcf \n"); - fprintf(bcftools_stderr, " -c, --columns columns of the input tsv file [ID,CHROM,POS,AA]\n"); - fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(bcftools_stderr, " -s, --samples list of sample names\n"); - fprintf(bcftools_stderr, " -S, --samples-file file of sample names\n"); + fprintf(bcftools_stderr, " --tsv2vcf FILE\n"); + fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); + fprintf(bcftools_stderr, " -s, 
--samples LIST List of sample names\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE File of sample names\n"); fprintf(bcftools_stderr, "\n"); // fprintf(bcftools_stderr, "PLINK options:\n"); // fprintf(bcftools_stderr, " -p, --plink |,,|,,|,\n"); @@ -1455,6 +1472,9 @@ int main_vcfconvert(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->regions_overlap = 1; + args->targets_overlap = 0; + args->clevel = -1; static struct option loptions[] = { @@ -1465,8 +1485,10 @@ int main_vcfconvert(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,13}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,14}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, {"sex",required_argument,NULL,11}, @@ -1488,6 +1510,7 @@ int main_vcfconvert(int argc, char *argv[]) {"keep-duplicates",no_argument,NULL,12}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { switch (c) { case 'e': @@ -1522,7 +1545,16 @@ int main_vcfconvert(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break; @@ 
-1530,6 +1562,18 @@ int main_vcfconvert(int argc, char *argv[]) case 10 : args->record_cmd_line = 0; break; case 11 : args->sex_fname = optarg; break; case 12 : args->keep_duplicates = 1; break; + case 13 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 14 : + if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c index 723bcdf..06b8d19 100644 --- a/bcftools/vcffilter.c +++ b/bcftools/vcffilter.c @@ -71,7 +71,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, clevel; char **argv, *output_fname, *targets_list, *regions_list; int argc, record_cmd_line; @@ -80,7 +80,9 @@ args_t; static void init_data(args_t *args) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -408,21 +410,23 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools filter [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -g, --SnpGap [:type] filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); - fprintf(stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); - fprintf(stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); - fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --soft-filter annotate FILTER column with or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); - fprintf(stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) 
or ref (0)\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); + fprintf(stderr, " -G, --IndelGap INT Filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); + fprintf(stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n"); + fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -s, --soft-filter STRING Annotate FILTER column with or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); + fprintf(stderr, " -S, --set-GTs .|0 Set genotypes of failed samples to missing (.) 
or ref (0)\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, "\n"); exit(1); } @@ -437,7 +441,10 @@ int main_vcffilter(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->clevel = -1; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -448,8 +455,10 @@ int main_vcffilter(int argc, char *argv[]) {"include",required_argument,NULL,'i'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, @@ -498,7 +507,16 @@ int main_vcffilter(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 's': args->soft_filter = optarg; break; @@ -523,6 +541,18 @@ int main_vcffilter(int argc, char *argv[]) break; 
case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -542,6 +572,7 @@ int main_vcffilter(int argc, char *argv[]) if ( args->regions_list ) { args->files->require_index = 1; + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } @@ -552,12 +583,14 @@ int main_vcffilter(int argc, char *argv[]) kputs(argv[optind+1],&tmp); for (i=optind+2; ifiles->require_index = 1; + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, tmp.s, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); free(tmp.s); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index 5709182..28824dc 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -73,7 +73,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, clevel; char **argv, *output_fname, 
*targets_list, *regions_list; int argc, record_cmd_line; @@ -82,7 +82,9 @@ args_t; static void init_data(args_t *args) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); @@ -410,21 +412,23 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools filter [options] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -g, --SnpGap [:type] filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); - fprintf(bcftools_stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); - fprintf(bcftools_stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); - fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --soft-filter annotate 
FILTER column with or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); - fprintf(bcftools_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); + fprintf(bcftools_stderr, " -G, --IndelGap INT Filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n"); + fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -s, --soft-filter STRING Annotate FILTER column with or unique filter name (\"Filter%%d\") made up by the 
program (\"+\")\n"); + fprintf(bcftools_stderr, " -S, --set-GTs .|0 Set genotypes of failed samples to missing (.) or ref (0)\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -439,7 +443,10 @@ int main_vcffilter(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->clevel = -1; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -450,8 +457,10 @@ int main_vcffilter(int argc, char *argv[]) {"include",required_argument,NULL,'i'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, @@ -500,7 +509,16 @@ int main_vcffilter(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse 
argument: --compression-level %s\n", optarg+1); } break; case 's': args->soft_filter = optarg; break; @@ -525,6 +543,18 @@ int main_vcffilter(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -544,6 +574,7 @@ int main_vcffilter(int argc, char *argv[]) if ( args->regions_list ) { args->files->require_index = 1; + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } @@ -554,12 +585,14 @@ int main_vcffilter(int argc, char *argv[]) kputs(argv[optind+1],&tmp); for (i=optind+2; ifiles->require_index = 1; + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, tmp.s, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); free(tmp.s); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index 8a96e3e..4d36b91 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -57,6 +57,7 @@ typedef struct bcf_hdr_t *gt_hdr, 
*qry_hdr; // VCF with genotypes to compare against and the query VCF char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples; int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; + int regions_overlap, targets_overlap; int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; double *pdiff, *qry_prob, *gt_prob; uint32_t *ndiff,*ncnt,ncmp, npairs; @@ -236,8 +237,16 @@ static void init_data(args_t *args) hts_srand48(0); args->files = bcf_sr_init(); - if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); - if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); + if ( args->regions ) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); + } + if ( args->targets ) + { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); + if ( bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); + } if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX); if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum)); @@ -1047,10 +1056,12 @@ static void usage(void) fprintf(stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the 
region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n"); fprintf(stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n"); fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Check discordance of all samples from B against all sample in A\n"); @@ -1074,6 +1085,8 @@ int main_vcfgtcheck(int argc, char *argv[]) args->gt_use_GT = -1; args->calc_hwe_prob = 1; args->use_PLs = 40; + args->regions_overlap = 1; + args->targets_overlap = 0; // external sort for --distinctive-sites #ifdef _WIN32 @@ -1110,8 +1123,10 @@ int main_vcfgtcheck(int argc, char *argv[]) {"distinctive-sites",1,0,6}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,7}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,8}, {"pairs",1,0,'p'}, {"pairs-file",1,0,'P'}, {0,0,0,0} @@ -1198,6 +1213,18 @@ int main_vcfgtcheck(int argc, char *argv[]) case 'R': args->regions = optarg; args->regions_is_file = 1; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; + case 7 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 8 : + if ( 
!strcasecmp(optarg,"0") ) args->targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index 6ab27ed..d13dd84 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -59,6 +59,7 @@ typedef struct bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples; int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file; + int regions_overlap, targets_overlap; int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl; double *pdiff, *qry_prob, *gt_prob; uint32_t *ndiff,*ncnt,ncmp, npairs; @@ -238,8 +239,16 @@ static void init_data(args_t *args) hts_srand48(0); args->files = bcf_sr_init(); - if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); - if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); + if ( args->regions ) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); + } + if ( args->targets ) + { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); + if ( bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); + } if ( args->gt_fname ) bcf_sr_set_opt(args->files, 
BCF_SR_REQUIRE_IDX); if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum)); @@ -1049,10 +1058,12 @@ static void usage(void) fprintf(bcftools_stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n"); fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n"); fprintf(bcftools_stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n"); fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Check discordance of all samples from B against all sample in A\n"); @@ -1076,6 +1087,8 @@ int main_vcfgtcheck(int argc, char *argv[]) args->gt_use_GT = -1; args->calc_hwe_prob = 1; args->use_PLs = 40; + args->regions_overlap = 1; + args->targets_overlap = 0; // external sort for --distinctive-sites #ifdef _WIN32 @@ -1112,8 +1125,10 @@ int main_vcfgtcheck(int argc, char *argv[]) {"distinctive-sites",1,0,6}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,7}, 
{"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,8}, {"pairs",1,0,'p'}, {"pairs-file",1,0,'P'}, {0,0,0,0} @@ -1200,6 +1215,18 @@ int main_vcfgtcheck(int argc, char *argv[]) case 'R': args->regions = optarg; args->regions_is_file = 1; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; + case 7 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 8 : + if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index 4a16d8a..5fd50c2 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -83,6 +83,7 @@ int vcf_index_stats(char *fname, int stats) * the total number of records. 
*/ int len = strlen(fname); + int idx_only = 0; if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) { fntemp = strdup(fname); if ( !fntemp ) return 1; @@ -96,27 +97,45 @@ int vcf_index_stats(char *fname, int stats) fntemp = strdup(fname); fname = fntemp; fname[len-4] = 0; + idx_only = 1; } if ( stats&per_contig ) { - fp = hts_open(fname,"r"); - if ( !fp ) { - fprintf(stderr,"Could not read %s\n", fname); - ret = 1; goto cleanup; + if ( idx_only ) + { + struct stat buf; + if ( stat(fname, &buf)==0 ) idx_only = 0; } - hdr = bcf_hdr_read(fp); - if ( !hdr ) { - fprintf(stderr,"Could not read the header: %s\n", fname); - ret = 1; goto cleanup; + + enum htsExactFormat fmt; + if ( !idx_only ) + { + fp = hts_open(fname,"r"); + if ( !fp ) { + fprintf(stderr,"Could not read %s\n", fname); + ret = 1; goto cleanup; + } + hdr = bcf_hdr_read(fp); + if ( !hdr ) { + fprintf(stderr,"Could not read the header: %s\n", fname); + ret = 1; goto cleanup; + } + fmt = hts_get_format(fp)->format; + } + else + { + int len = strlen(fnidx); + if ( !strcasecmp(".tbi",fnidx+len-4) ) fmt = vcf; + else fmt = bcf; } - if ( hts_get_format(fp)->format==vcf ) + if ( fmt==vcf ) { tbx = tbx_index_load2(fname, fnidx); if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; } } - else if ( hts_get_format(fp)->format==bcf ) + else if ( fmt==bcf ) { idx = bcf_index_load2(fname, fnidx); if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; } @@ -158,19 +177,17 @@ int vcf_index_stats(char *fname, int stats) } else { nseq = hts_idx_nseq(idx); } - + if ( !tbx && !hdr ) fprintf(stderr,"Warning: cannot determine contig names given the .csi index alone\n"); for (tid=0; tididx : idx, tid, &records, &v); sum += records; if ( (stats&total) || !records ) continue; - const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL; - if ( ctg_name ) { - bcf_hrec_t *hrec = hdr ? 
bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; - int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; - printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); - } + const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a"; + bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; + int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; + printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); } if ( !sum ) { diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index acbae89..43d342d 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -85,6 +85,7 @@ int vcf_index_stats(char *fname, int stats) * the total number of records. */ int len = strlen(fname); + int idx_only = 0; if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) { fntemp = strdup(fname); if ( !fntemp ) return 1; @@ -98,27 +99,45 @@ int vcf_index_stats(char *fname, int stats) fntemp = strdup(fname); fname = fntemp; fname[len-4] = 0; + idx_only = 1; } if ( stats&per_contig ) { - fp = hts_open(fname,"r"); - if ( !fp ) { - fprintf(bcftools_stderr,"Could not read %s\n", fname); - ret = 1; goto cleanup; + if ( idx_only ) + { + struct stat buf; + if ( stat(fname, &buf)==0 ) idx_only = 0; } - hdr = bcf_hdr_read(fp); - if ( !hdr ) { - fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); - ret = 1; goto cleanup; + + enum htsExactFormat fmt; + if ( !idx_only ) + { + fp = hts_open(fname,"r"); + if ( !fp ) { + fprintf(bcftools_stderr,"Could not read %s\n", fname); + ret = 1; goto cleanup; + } + hdr = bcf_hdr_read(fp); + if ( !hdr ) { + fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); + ret = 1; goto cleanup; + } + fmt = hts_get_format(fp)->format; + } + else + { + int len = strlen(fnidx); + if ( !strcasecmp(".tbi",fnidx+len-4) ) fmt = vcf; + else fmt = bcf; } - if ( hts_get_format(fp)->format==vcf ) + if ( 
fmt==vcf ) { tbx = tbx_index_load2(fname, fnidx); if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; } } - else if ( hts_get_format(fp)->format==bcf ) + else if ( fmt==bcf ) { idx = bcf_index_load2(fname, fnidx); if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; } @@ -160,19 +179,17 @@ int vcf_index_stats(char *fname, int stats) } else { nseq = hts_idx_nseq(idx); } - + if ( !tbx && !hdr ) fprintf(bcftools_stderr,"Warning: cannot determine contig names given the .csi index alone\n"); for (tid=0; tididx : idx, tid, &records, &v); sum += records; if ( (stats&total) || !records ) continue; - const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL; - if ( ctg_name ) { - bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; - int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; - fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); - } + const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a"; + bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; + int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; + fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); } if ( !sum ) { diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index 1d2fab1..acc1885 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -50,7 +50,7 @@ THE SOFTWARE. */ typedef struct { - int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads; + int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads, clevel; int nflt, *flt_logic; filter_t **flt; char **flt_expr; @@ -141,7 +141,9 @@ void isec_vcf(args_t *args) if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { - out_fh = hts_open(args->output_fname? 
args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); @@ -356,7 +358,9 @@ static void init_data(args_t *args) #define OPEN_FILE(i,j) { \ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \ - args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \ + char wmode[8]; \ + set_wmode(wmode,args->output_type,args->fnames[i],args->clevel); \ + args->fh_out[i] = hts_open(args->fnames[i], wmode); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ @@ -458,22 +462,24 @@ static void usage(void) fprintf(stderr, "Usage: bcftools isec [options] [...]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(stderr, " -C, --complement output positions present only in the first file but missing in the others\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(stderr, " -i, --include include only sites for which the expression is true\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -p, --prefix if given, subset each of the input files accordingly, see also -w\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -w, --write list of files to write with -p given as 1-based indexes. By default, all files are written\n"); + fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(stderr, " -C, --complement Output positions present only in the first file but missing in the others\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(stderr, " -i, --include EXPR Include only sites for which the expression is true\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -n, --nfiles [+-=~]INT Output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " -p, --prefix DIR If given, subset each of the input files accordingly, see also -w\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -504,7 +510,10 @@ int main_vcfisec(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->clevel = -1; int targets_is_file = 0, regions_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -519,14 +528,17 @@ int main_vcfisec(int argc, char *argv[]) {"write",required_argument,NULL,'w'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) { switch (c) { case 'o': args->output_fname = optarg; break; @@ -536,7 +548,16 @@ int main_vcfisec(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'c': @@ -576,6 +597,18 @@ int main_vcfisec(int argc, char *argv[]) else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg); } 
break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 'h': @@ -584,10 +617,18 @@ int main_vcfisec(int argc, char *argv[]) } } if ( argc-optind<1 ) usage(); // no file given - if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 ) - error("Failed to read the targets: %s\n", args->targets_list); - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->targets_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); + if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 ) + error("Failed to read the targets: %s\n", args->targets_list); + } + if ( args->regions_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + } if ( argc-optind==2 && !args->isec_op ) { args->isec_op = OP_VENN; diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index d59d7df..87178cf 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -52,7 +52,7 @@ THE SOFTWARE. 
*/ typedef struct { - int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads; + int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads, clevel; int nflt, *flt_logic; filter_t **flt; char **flt_expr; @@ -143,7 +143,9 @@ void isec_vcf(args_t *args) if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { - out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); @@ -358,7 +360,9 @@ static void init_data(args_t *args) #define OPEN_FILE(i,j) { \ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \ - args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \ + char wmode[8]; \ + set_wmode(wmode,args->output_type,args->fnames[i],args->clevel); \ + args->fh_out[i] = hts_open(args->fnames[i], wmode); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ @@ -460,22 +464,24 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools isec [options] [...]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(bcftools_stderr, " -C, --complement output positions 
present only in the first file but missing in the others\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(bcftools_stderr, " -i, --include include only sites for which the expression is true\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -p, --prefix if given, subset each of the input files accordingly, see also -w\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); + fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(bcftools_stderr, " -C, --complement Output positions present only in the first file but missing in the others\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Include only sites for which the expression is true\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -n, --nfiles [+-=~]INT Output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " -p, --prefix DIR If given, subset each of the input files accordingly, see also -w\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads 
[0]\n"); + fprintf(bcftools_stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -506,7 +512,10 @@ int main_vcfisec(int argc, char *argv[]) args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; + args->clevel = -1; int targets_is_file = 0, regions_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -521,14 +530,17 @@ int main_vcfisec(int argc, char *argv[]) {"write",required_argument,NULL,'w'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) { switch (c) { case 'o': args->output_fname = optarg; break; @@ -538,7 +550,16 @@ int main_vcfisec(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'c': @@ -578,6 +599,18 
@@ int main_vcfisec(int argc, char *argv[]) else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg); } break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 'h': @@ -586,10 +619,18 @@ int main_vcfisec(int argc, char *argv[]) } } if ( argc-optind<1 ) usage(); // no file given - if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 ) - error("Failed to read the targets: %s\n", args->targets_list); - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->targets_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); + if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 ) + error("Failed to read the targets: %s\n", args->targets_list); + } + if ( args->regions_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + } if ( argc-optind==2 && !args->isec_op ) { args->isec_op = OP_VENN; diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 637e1b9..f87bce7 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -161,8 +161,9 @@ typedef struct htsFile *out_fh; bcf_hdr_t *out_hdr; 
char **argv; - int argc, n_threads, record_cmd_line; + int argc, n_threads, record_cmd_line, clevel; int local_alleles; // the value of -L option + int keep_AC_AN; } args_t; @@ -345,6 +346,19 @@ static void info_rules_init(args_t *args) if ( str.l ) kputc(',',&str); kputs("IMF:max",&str); } + if ( !bcf_hdr_nsamples(args->out_hdr) ) + { + if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AN")) ) + { + if ( str.l ) kputc(',',&str); + kputs("AN:sum",&str); + } + if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AC")) ) + { + if ( str.l ) kputc(',',&str); + kputs("AC:sum",&str); + } + } if ( !str.l ) return; args->info_rules = str.s; @@ -376,6 +390,8 @@ static void info_rules_init(args_t *args) else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); + if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1; + ss = strchr(ss, '\0'); ss++; if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag); @@ -1057,7 +1073,7 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t * @param src: source string * @param isrc: index of the field to copy * @param src_len: length of source string (excluding the terminating \0) - * @param dst: destination kstring (must be initialized) + * @param dst: destination kstring (must be initialized with missing values, e.g. 
as ".") * @param idst: index of the destination field */ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst) @@ -1251,7 +1267,7 @@ void merge_info(args_t *args, bcf1_t *out) bcf_info_t *inf = &line->d.info[j]; const char *key = hdr->id[BCF_DT_ID][inf->key].key; - if ( !strcmp("AC",key) || !strcmp("AN",key) ) continue; // AC and AN are done in merge_format() after genotypes are done + if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key); if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key); @@ -2268,7 +2284,8 @@ void merge_format(args_t *args, bcf1_t *out) out->n_sample = bcf_hdr_nsamples(out_hdr); if ( has_GT ) merge_GT(args, ma->fmt_map, out); - update_AN_AC(out_hdr, out); + if ( !args->keep_AC_AN ) + update_AN_AC(out_hdr, out); for (i=1; i<=max_ifmt; i++) merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); @@ -3003,7 +3020,9 @@ void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr) } void merge_vcf(args_t *args) { - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); @@ -3083,24 +3102,25 @@ static void usage(void) fprintf(stderr, "Usage: bcftools merge [options] [...]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " --force-samples resolve duplicate sample names\n"); - fprintf(stderr, " --print-header print only the merged header and exit\n"); - fprintf(stderr, " --use-header use the provided header\n"); - fprintf(stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n"); - fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(stderr, " -F, --filter-logic remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); - fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. 
Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); - fprintf(stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); - fprintf(stderr, " -l, --file-list read file names from the file\n"); - fprintf(stderr, " -L, --local-alleles EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); - fprintf(stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); - fprintf(stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " --force-samples Resolve duplicate sample names\n"); + fprintf(stderr, " --print-header Print only the merged header and exit\n"); + fprintf(stderr, " --use-header FILE Use the provided header\n"); + fprintf(stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n"); + fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); + fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); + fprintf(stderr, " -i, --info-rules TAG:METHOD,.. 
Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); + fprintf(stderr, " -l, --file-list FILE Read file names from the file\n"); + fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); + fprintf(stderr, " -m, --merge STRING Allow multiallelic records for , see man page for details [both]\n"); + fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, "\n"); exit(1); } @@ -3116,7 +3136,9 @@ int main_vcfmerge(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->collapse = COLLAPSE_BOTH; + args->clevel = -1; int regions_is_file = 0; + int regions_overlap = 1; static struct option loptions[] = { @@ -3135,6 +3157,7 @@ int main_vcfmerge(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,4}, {"info-rules",required_argument,NULL,'i'}, {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, @@ -3173,7 +3196,16 @@ int main_vcfmerge(int argc, char *argv[]) case 'u': 
args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'm': @@ -3193,6 +3225,12 @@ int main_vcfmerge(int argc, char *argv[]) case 1 : args->header_fname = optarg; break; case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; + case 4 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->no_index = 1; break; @@ -3213,6 +3251,7 @@ int main_vcfmerge(int argc, char *argv[]) bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( regions_is_file ) diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 0f1c94c..03119ae 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -163,8 +163,9 @@ typedef struct htsFile *out_fh; bcf_hdr_t *out_hdr; char **argv; - int argc, n_threads, record_cmd_line; + int argc, n_threads, record_cmd_line, clevel; int local_alleles; // the value of -L option + int keep_AC_AN; } args_t; @@ -347,6 +348,19 @@ static void 
info_rules_init(args_t *args) if ( str.l ) kputc(',',&str); kputs("IMF:max",&str); } + if ( !bcf_hdr_nsamples(args->out_hdr) ) + { + if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AN")) ) + { + if ( str.l ) kputc(',',&str); + kputs("AN:sum",&str); + } + if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AC")) ) + { + if ( str.l ) kputc(',',&str); + kputs("AC:sum",&str); + } + } if ( !str.l ) return; args->info_rules = str.s; @@ -378,6 +392,8 @@ static void info_rules_init(args_t *args) else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); + if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1; + ss = strchr(ss, '\0'); ss++; if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag); @@ -1059,7 +1075,7 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t * @param src: source string * @param isrc: index of the field to copy * @param src_len: length of source string (excluding the terminating \0) - * @param dst: destination kstring (must be initialized) + * @param dst: destination kstring (must be initialized with missing values, e.g. 
as ".") * @param idst: index of the destination field */ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst) @@ -1253,7 +1269,7 @@ void merge_info(args_t *args, bcf1_t *out) bcf_info_t *inf = &line->d.info[j]; const char *key = hdr->id[BCF_DT_ID][inf->key].key; - if ( !strcmp("AC",key) || !strcmp("AN",key) ) continue; // AC and AN are done in merge_format() after genotypes are done + if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key); if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key); @@ -2270,7 +2286,8 @@ void merge_format(args_t *args, bcf1_t *out) out->n_sample = bcf_hdr_nsamples(out_hdr); if ( has_GT ) merge_GT(args, ma->fmt_map, out); - update_AN_AC(out_hdr, out); + if ( !args->keep_AC_AN ) + update_AN_AC(out_hdr, out); for (i=1; i<=max_ifmt; i++) merge_format_field(args, &ma->fmt_map[i*files->nreaders], out); @@ -3005,7 +3022,9 @@ void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr) } void merge_vcf(args_t *args) { - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); @@ -3085,24 +3104,25 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools merge [options] [...]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " --force-samples resolve duplicate sample names\n"); - fprintf(bcftools_stderr, " --print-header print only the merged header and exit\n"); - fprintf(bcftools_stderr, " --use-header use the provided header\n"); - fprintf(bcftools_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n"); - fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(bcftools_stderr, " -F, --filter-logic remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); - fprintf(bcftools_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. 
Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); - fprintf(bcftools_stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); - fprintf(bcftools_stderr, " -l, --file-list read file names from the file\n"); - fprintf(bcftools_stderr, " -L, --local-alleles EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); - fprintf(bcftools_stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); - fprintf(bcftools_stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --force-samples Resolve duplicate sample names\n"); + fprintf(bcftools_stderr, " --print-header Print only the merged header and exit\n"); + fprintf(bcftools_stderr, " --use-header FILE Use the provided header\n"); + fprintf(bcftools_stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n"); + fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(bcftools_stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); + fprintf(bcftools_stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); + fprintf(bcftools_stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); + fprintf(bcftools_stderr, " -l, --file-list FILE Read file names from the file\n"); + fprintf(bcftools_stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); + fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for , see man page for details [both]\n"); + fprintf(bcftools_stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -3118,7 +3138,9 @@ int main_vcfmerge(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->collapse = COLLAPSE_BOTH; + args->clevel = -1; int regions_is_file = 0; + int 
regions_overlap = 1; static struct option loptions[] = { @@ -3137,6 +3159,7 @@ int main_vcfmerge(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,4}, {"info-rules",required_argument,NULL,'i'}, {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, @@ -3175,7 +3198,16 @@ int main_vcfmerge(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'm': @@ -3195,6 +3227,12 @@ int main_vcfmerge(int argc, char *argv[]) case 1 : args->header_fname = optarg; break; case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; + case 4 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->no_index = 1; break; @@ -3215,6 +3253,7 @@ int main_vcfmerge(int argc, char *argv[]) bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( 
regions_is_file ) diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 7b510b1..8a140fb 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -97,7 +97,7 @@ typedef struct faidx_t *fai; struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; - int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; + int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; @@ -1252,7 +1252,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ int nsmpl = bcf_hdr_nsamples(args->hdr); ngts /= nsmpl; - int i, j, k; + int i, j, k,k2; for (i=1; intmp_arr2 / 4; @@ -1265,16 +1265,19 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ int32_t *gt2 = (int32_t*) args->tmp_arr2; for (j=0; j=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2); + int ial = args->maps[i].map[ial2]; + for (k=0; k=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial); - gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); + gt[k] = bcf_gt_unphased(ial); } } gt += ngts; @@ -1987,7 +1990,9 @@ static bcf1_t *next_atomized_line(args_t *args) } static void normalize_vcf(args_t *args) { - args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); @@ -2085,12 +2090,14 @@ static void usage(void) fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n"); fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, "\n"); @@ -2118,9 +2125,12 @@ int main_vcfnorm(int argc, char *argv[]) args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; args->do_indels = 1; + args->clevel = -1; int region_is_file = 0; int targets_is_file = 0; 
args->use_star_allele = 1; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -2135,8 +2145,10 @@ int main_vcfnorm(int argc, char *argv[]) {"multiallelics",required_argument,NULL,'m'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,1}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"site-win",required_argument,NULL,'w'}, {"remove-duplicates",no_argument,NULL,'D'}, {"rm-dup",required_argument,NULL,'d'}, @@ -2200,7 +2212,16 @@ int main_vcfnorm(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'o': args->output_fname = optarg; break; @@ -2221,6 +2242,18 @@ int main_vcfnorm(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 7 : args->force = 1; break; + case 1 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 2 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap 
%s\n",optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -2241,11 +2274,13 @@ int main_vcfnorm(int argc, char *argv[]) if ( args->region ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 ) error("Failed to read the regions: %s\n", args->region); } if ( args->targets ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); } diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index e48443f..aa21490 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -99,7 +99,7 @@ typedef struct faidx_t *fai; struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; - int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; + int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; @@ -1254,7 +1254,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ int nsmpl = bcf_hdr_nsamples(args->hdr); ngts /= nsmpl; - int i, j, k; + int i, j, k,k2; for (i=1; intmp_arr2 / 4; @@ -1267,16 +1267,19 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ int32_t *gt2 = (int32_t*) args->tmp_arr2; for (j=0; j=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2); + int ial = args->maps[i].map[ial2]; + for (k=0; k=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial); - gt[k] = 
bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); + gt[k] = bcf_gt_unphased(ial); } } gt += ngts; @@ -1989,7 +1992,9 @@ static bcf1_t *next_atomized_line(args_t *args) } static void normalize_vcf(args_t *args) { - args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); @@ -2087,12 +2092,14 @@ static void usage(void) fprintf(bcftools_stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); fprintf(bcftools_stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n"); fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + 
fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(bcftools_stderr, "\n"); @@ -2120,9 +2127,12 @@ int main_vcfnorm(int argc, char *argv[]) args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; args->do_indels = 1; + args->clevel = -1; int region_is_file = 0; int targets_is_file = 0; args->use_star_allele = 1; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -2137,8 +2147,10 @@ int main_vcfnorm(int argc, char *argv[]) {"multiallelics",required_argument,NULL,'m'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,1}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"site-win",required_argument,NULL,'w'}, {"remove-duplicates",no_argument,NULL,'D'}, {"rm-dup",required_argument,NULL,'d'}, @@ -2202,7 +2214,16 @@ int main_vcfnorm(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; case 'o': args->output_fname = optarg; break; @@ -2223,6 +2244,18 @@ int main_vcfnorm(int argc, char *argv[]) case 9 : args->n_threads = 
strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 7 : args->force = 1; break; + case 1 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 2 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -2243,11 +2276,13 @@ int main_vcfnorm(int argc, char *argv[]) if ( args->region ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 ) error("Failed to read the regions: %s\n", args->region); } if ( args->targets ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); } diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index c4ea52d..7656192 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -137,7 +137,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr, *hdr_out; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, clevel; filter_t *filter; char *filter_str; @@ -522,7 +522,9 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? 
args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -558,22 +560,24 @@ static void usage(args_t *args) fprintf(stderr, " bcftools +name [OPTIONS] [-- PLUGIN_OPTIONS]\n"); fprintf(stderr, "\n"); fprintf(stderr, "VCF input options:\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, "VCF output options:\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " 
-o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " --threads INTT Use multithreading with worker threads [0]\n"); fprintf(stderr, "Plugin options:\n"); - fprintf(stderr, " -h, --help list plugin's options\n"); - fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); - fprintf(stderr, " -V, --version print version string and exit\n"); + fprintf(stderr, " -h, --help List plugin's options\n"); + fprintf(stderr, " -l, --list-plugins List available plugins. 
See BCFTOOLS_PLUGINS environment variable and man page for details\n"); + fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); + fprintf(stderr, " -V, --version Print version string and exit\n"); fprintf(stderr, "\n"); exit(1); } @@ -609,7 +613,10 @@ int main_plugin(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->nplugin_paths = -1; + args->clevel = -1; int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0; + int regions_overlap = 1; + int targets_overlap = 0; if ( argc==1 ) usage(args); char *plugin_name = NULL; @@ -643,11 +650,14 @@ int main_plugin(int argc, char *argv[]) {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,1}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0) { switch (c) { @@ -660,8 +670,17 @@ int main_plugin(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); @@ -674,6 +693,18 @@ int main_plugin(int argc, char *argv[]) case 't': args->targets_list = 
optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': args->plist_only = 1; break; + case 1 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 2 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': @@ -727,11 +758,13 @@ int main_plugin(int argc, char *argv[]) args->files = bcf_sr_init(); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); args->files->collapse |= COLLAPSE_SOME; diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index 2143a0a..cb577d5 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -139,7 +139,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr, *hdr_out; htsFile *out_fh; - int output_type, n_threads; + int output_type, n_threads, clevel; filter_t *filter; char *filter_str; @@ -524,7 +524,9 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { - args->out_fh = 
hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -560,22 +562,24 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " bcftools +name [OPTIONS] [-- PLUGIN_OPTIONS]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "VCF input options:\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " 
-T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, "VCF output options:\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " --threads INTT Use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, "Plugin options:\n"); - fprintf(bcftools_stderr, " -h, --help list plugin's options\n"); - fprintf(bcftools_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(bcftools_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); - fprintf(bcftools_stderr, " -V, --version print version string and exit\n"); + fprintf(bcftools_stderr, " -h, --help List plugin's options\n"); + fprintf(bcftools_stderr, " -l, --list-plugins List available plugins. 
See BCFTOOLS_PLUGINS environment variable and man page for details\n"); + fprintf(bcftools_stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); + fprintf(bcftools_stderr, " -V, --version Print version string and exit\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -611,7 +615,10 @@ int main_plugin(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->nplugin_paths = -1; + args->clevel = -1; int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0; + int regions_overlap = 1; + int targets_overlap = 0; if ( argc==1 ) usage(args); char *plugin_name = NULL; @@ -645,11 +652,14 @@ int main_plugin(int argc, char *argv[]) {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,1}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0) { switch (c) { @@ -662,8 +672,17 @@ int main_plugin(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); @@ -676,6 +695,18 @@ int main_plugin(int argc, char *argv[]) 
case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': args->plist_only = 1; break; + case 1 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 2 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': @@ -729,11 +760,13 @@ int main_plugin(int argc, char *argv[]) args->files = bcf_sr_init(); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); args->files->collapse |= COLLAPSE_SOME; diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 6568c82..882c3bb 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -221,20 +221,22 @@ static void usage(void) fprintf(stderr, "Usage: bcftools query [options] [ [...]]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -f, --format see man page for details\n"); - fprintf(stderr, " -H, --print-header print header\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true 
(see man page for details)\n"); - fprintf(stderr, " -l, --list-samples print the list of samples and exit\n"); - fprintf(stderr, " -o, --output output file name [stdout]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples list of samples to include\n"); - fprintf(stderr, " -S, --samples-file file of samples to include\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n"); - fprintf(stderr, " -v, --vcf-list process multiple VCFs listed in the file\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -f, --format STRING See man page for details\n"); + fprintf(stderr, " -H, --print-header Print header\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -l, --list-samples Print the list of samples and exit\n"); + fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -s, --samples LIST List of samples to include\n"); + fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " 
--targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n"); + fprintf(stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); @@ -248,6 +250,8 @@ int main_vcfquery(int argc, char *argv[]) args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -260,8 +264,10 @@ int main_vcfquery(int argc, char *argv[]) {"output",1,0,'o'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,1}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"annots",1,0,'a'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, @@ -311,6 +317,18 @@ int main_vcfquery(int argc, char *argv[]) case 'u': args->allow_undef_tags = 1; break; case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; + case 1 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 2 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -348,10 +366,15 @@ int main_vcfquery(int argc, char *argv[]) if ( !fname ) usage(); args->files = 
bcf_sr_init(); if ( optind+1 < argc ) args->files->require_index = 1; - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->regions_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index fc264b7..1ed3d00 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -223,20 +223,22 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools query [options] [ [...]]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -f, --format see man page for details\n"); - fprintf(bcftools_stderr, " -H, --print-header print header\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -l, --list-samples print the list of samples and exit\n"); - fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -s, --samples list of samples to include\n"); - fprintf(bcftools_stderr, " -S, --samples-file file of samples to include\n"); - 
fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n"); - fprintf(bcftools_stderr, " -v, --vcf-list process multiple VCFs listed in the file\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -f, --format STRING See man page for details\n"); + fprintf(bcftools_stderr, " -H, --print-header Print header\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -l, --list-samples Print the list of samples and exit\n"); + fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(bcftools_stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n"); + fprintf(bcftools_stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n"); 
fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); @@ -250,6 +252,8 @@ int main_vcfquery(int argc, char *argv[]) args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -262,8 +266,10 @@ int main_vcfquery(int argc, char *argv[]) {"output",1,0,'o'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,1}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"annots",1,0,'a'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, @@ -313,6 +319,18 @@ int main_vcfquery(int argc, char *argv[]) case 'u': args->allow_undef_tags = 1; break; case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; + case 1 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 2 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -350,10 +368,15 @@ int main_vcfquery(int argc, char *argv[]) if ( !fname ) usage(); args->files = bcf_sr_init(); if ( optind+1 < argc ) args->files->require_index = 1; - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( 
args->regions_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c index 8e95c9a..fd8beb2 100644 --- a/bcftools/vcfroh.c +++ b/bcftools/vcfroh.c @@ -1094,10 +1094,12 @@ static void usage(args_t *args) fprintf(stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --samples list of samples to analyze [all samples]\n"); fprintf(stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "HMM Options:\n"); @@ -1118,6 +1120,8 @@ int main_vcfroh(int argc, char *argv[]) args->t2HW = 5e-9; args->rec_rate = 0; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -1140,8 +1144,10 @@ int main_vcfroh(int argc, char *argv[]) {"viterbi-training",1,0,'V'}, 
{"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,6}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,7}, {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, @@ -1201,6 +1207,18 @@ int main_vcfroh(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; + case 6 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 7 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'V': args->vi_training = 1; @@ -1229,11 +1247,13 @@ int main_vcfroh(int argc, char *argv[]) if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n"); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index b742faa..1546461 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -1096,10 +1096,12 @@ static void usage(args_t *args) 
fprintf(bcftools_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " -s, --samples list of samples to analyze [all samples]\n"); fprintf(bcftools_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "HMM Options:\n"); @@ -1120,6 +1122,8 @@ int main_vcfroh(int argc, char *argv[]) args->t2HW = 5e-9; args->rec_rate = 0; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -1142,8 +1146,10 @@ int main_vcfroh(int argc, char *argv[]) {"viterbi-training",1,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,6}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,7}, {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, @@ -1203,6 +1209,18 @@ int main_vcfroh(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; + case 6 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + 
else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 7 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'V': args->vi_training = 1; @@ -1231,11 +1249,13 @@ int main_vcfroh(int argc, char *argv[]) if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n"); if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c index 7ec13fb..a8052d0 100644 --- a/bcftools/vcfsort.c +++ b/bcftools/vcfsort.c @@ -56,9 +56,10 @@ typedef struct _args_t { bcf_hdr_t *hdr; char **argv, *fname, *output_fname, *tmp_dir; - int argc, output_type; + int argc, output_type, clevel; size_t max_mem, mem; bcf1_t **buf; + uint8_t *mem_block; size_t nbuf, mbuf, nblk; blk_t *blk; } @@ -104,8 +105,6 @@ int cmp_bcf_pos(const void *aptr, const void *bptr) // This will be called rarely so should not slow the sorting down // noticeably. 
- if ( !a->unpacked ) bcf_unpack(a, BCF_UN_STR); - if ( !b->unpacked ) bcf_unpack(b, BCF_UN_STR); int i; for (i=0; in_allele; i++) { @@ -141,7 +140,6 @@ void buf_flush(args_t *args) for (i=0; inbuf; i++) { if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - bcf_destroy(args->buf[i]); } if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname); @@ -149,14 +147,83 @@ void buf_flush(args_t *args) args->mem = 0; } + +static inline uint8_t *_align_up(uint8_t *ptr) +{ + return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1))); +} + void buf_push(args_t *args, bcf1_t *rec) { - int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*); - if ( args->mem + delta > args->max_mem ) buf_flush(args); + size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1] + + sizeof(*rec->d.allele)*rec->d.m_allele + + sizeof(bcf1_t*) // args->buf + + 8; // the number of _align_up() calls + + if ( delta > args->max_mem - args->mem ) + { + args->nbuf++; + hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); + args->buf[args->nbuf-1] = rec; + buf_flush(args); + bcf_destroy(rec); + return; + } + + // make sure nothing has changed in htslib + assert( rec->unpacked==BCF_UN_STR && !rec->d.flt && !rec->d.info && !rec->d.fmt && !rec->d.var ); + + uint8_t *ptr_beg = args->mem_block + args->mem; + uint8_t *ptr = _align_up(ptr_beg); + bcf1_t *new_rec = (bcf1_t*)ptr; + memcpy(new_rec,rec,sizeof(*rec)); + ptr += sizeof(*rec); + + // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest + // data type in the structure + char **allele = (char**)ptr; + ptr += rec->n_allele*sizeof(*allele); + + // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark + // and the end may be uninitialized + delta = 
rec->d.allele[rec->n_allele-1] - rec->d.allele[0]; + while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break; + memcpy(ptr,rec->d.als,delta); + new_rec->d.als = (char*)ptr; + ptr = ptr + delta; + + int i; + for (i=0; in_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]); + new_rec->d.allele = allele; + + memcpy(ptr,rec->shared.s,rec->shared.l); + new_rec->shared.s = (char*)ptr; + new_rec->shared.m = rec->shared.l; + ptr += rec->shared.l; + + memcpy(ptr,rec->indiv.s,rec->indiv.l); + new_rec->indiv.s = (char*)ptr; + new_rec->indiv.m = rec->indiv.l; + ptr += rec->indiv.l; + + // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark + // and the end may be uninitialized + i = 0; + while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break; + memcpy(ptr,rec->d.id,i); + new_rec->d.id = (char*)ptr; + ptr += i; + args->nbuf++; - args->mem += delta; hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); - args->buf[args->nbuf-1] = rec; + args->buf[args->nbuf-1] = new_rec; + + delta = ptr - ptr_beg; + args->mem += delta; + + assert( args->mem <= args->max_mem ); + + bcf_destroy(rec); } void sort_blocks(args_t *args) @@ -177,6 +244,7 @@ void sort_blocks(args_t *args) break; } if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); + bcf_unpack(rec, BCF_UN_STR); buf_push(args, rec); } buf_flush(args); @@ -206,13 +274,13 @@ void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) blk->fh = 0; return; } + bcf_unpack(blk->rec, BCF_UN_STR); khp_insert(blk, bhp, &blk); } void merge_blocks(args_t *args) { fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk); - khp_blk_t *bhp = khp_init(blk); int i; @@ -227,7 +295,9 @@ void merge_blocks(args_t *args) blk_read(args, bhp, args->hdr, blk); } - htsFile *out = hts_open(args->output_fname, 
hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); while ( bhp->ndat ) { @@ -252,13 +322,15 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools sort [OPTIONS] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(stderr, " -o, --output FILE output file name [stdout]\n"); - fprintf(stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(stderr, " -o, --output FILE output file name [stdout]\n"); + fprintf(stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + #ifdef _WIN32 - fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); + fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); #else - fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); + fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); #endif fprintf(stderr, "\n"); exit(1); @@ -278,6 +350,10 @@ size_t parse_mem_string(const char *str) void mkdir_p(const char *fmt, ...); static void init(args_t *args) { + args->max_mem *= 0.9; + args->mem_block = malloc(args->max_mem); + args->mem = 0; + args->tmp_dir = init_tmp_prefix(args->tmp_dir); #ifdef _WIN32 @@ -295,6 +371,7 @@ static void 
init(args_t *args) static void destroy(args_t *args) { bcf_hdr_destroy(args->hdr); + free(args->mem_block); free(args->tmp_dir); free(args); } @@ -306,6 +383,7 @@ int main_sort(int argc, char *argv[]) args->argc = argc; args->argv = argv; args->max_mem = 768*1000*1000; args->output_fname = "-"; + args->clevel = -1; static struct option loptions[] = { @@ -317,6 +395,7 @@ int main_sort(int argc, char *argv[]) {"help",no_argument,NULL,'h'}, {0,0,0,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0) { switch (c) @@ -330,8 +409,17 @@ int main_sort(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'h': case '?': usage(args); break; diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c index 1fd74d3..d3eb6b7 100644 --- a/bcftools/vcfsort.c.pysam.c +++ b/bcftools/vcfsort.c.pysam.c @@ -58,9 +58,10 @@ typedef struct _args_t { bcf_hdr_t *hdr; char **argv, *fname, *output_fname, *tmp_dir; - int argc, output_type; + int argc, output_type, clevel; size_t max_mem, mem; bcf1_t **buf; + uint8_t *mem_block; size_t nbuf, mbuf, nblk; blk_t *blk; } @@ -106,8 +107,6 @@ int cmp_bcf_pos(const void *aptr, const void *bptr) // This will be called rarely so should not slow the sorting down // noticeably. 
- if ( !a->unpacked ) bcf_unpack(a, BCF_UN_STR); - if ( !b->unpacked ) bcf_unpack(b, BCF_UN_STR); int i; for (i=0; in_allele; i++) { @@ -143,7 +142,6 @@ void buf_flush(args_t *args) for (i=0; inbuf; i++) { if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - bcf_destroy(args->buf[i]); } if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname); @@ -151,14 +149,83 @@ void buf_flush(args_t *args) args->mem = 0; } + +static inline uint8_t *_align_up(uint8_t *ptr) +{ + return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1))); +} + void buf_push(args_t *args, bcf1_t *rec) { - int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*); - if ( args->mem + delta > args->max_mem ) buf_flush(args); + size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1] + + sizeof(*rec->d.allele)*rec->d.m_allele + + sizeof(bcf1_t*) // args->buf + + 8; // the number of _align_up() calls + + if ( delta > args->max_mem - args->mem ) + { + args->nbuf++; + hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); + args->buf[args->nbuf-1] = rec; + buf_flush(args); + bcf_destroy(rec); + return; + } + + // make sure nothing has changed in htslib + assert( rec->unpacked==BCF_UN_STR && !rec->d.flt && !rec->d.info && !rec->d.fmt && !rec->d.var ); + + uint8_t *ptr_beg = args->mem_block + args->mem; + uint8_t *ptr = _align_up(ptr_beg); + bcf1_t *new_rec = (bcf1_t*)ptr; + memcpy(new_rec,rec,sizeof(*rec)); + ptr += sizeof(*rec); + + // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest + // data type in the structure + char **allele = (char**)ptr; + ptr += rec->n_allele*sizeof(*allele); + + // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark + // and the end may be uninitialized + delta = 
rec->d.allele[rec->n_allele-1] - rec->d.allele[0]; + while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break; + memcpy(ptr,rec->d.als,delta); + new_rec->d.als = (char*)ptr; + ptr = ptr + delta; + + int i; + for (i=0; in_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]); + new_rec->d.allele = allele; + + memcpy(ptr,rec->shared.s,rec->shared.l); + new_rec->shared.s = (char*)ptr; + new_rec->shared.m = rec->shared.l; + ptr += rec->shared.l; + + memcpy(ptr,rec->indiv.s,rec->indiv.l); + new_rec->indiv.s = (char*)ptr; + new_rec->indiv.m = rec->indiv.l; + ptr += rec->indiv.l; + + // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark + // and the end may be uninitialized + i = 0; + while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break; + memcpy(ptr,rec->d.id,i); + new_rec->d.id = (char*)ptr; + ptr += i; + args->nbuf++; - args->mem += delta; hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); - args->buf[args->nbuf-1] = rec; + args->buf[args->nbuf-1] = new_rec; + + delta = ptr - ptr_beg; + args->mem += delta; + + assert( args->mem <= args->max_mem ); + + bcf_destroy(rec); } void sort_blocks(args_t *args) @@ -179,6 +246,7 @@ void sort_blocks(args_t *args) break; } if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); + bcf_unpack(rec, BCF_UN_STR); buf_push(args, rec); } buf_flush(args); @@ -208,13 +276,13 @@ void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) blk->fh = 0; return; } + bcf_unpack(blk->rec, BCF_UN_STR); khp_insert(blk, bhp, &blk); } void merge_blocks(args_t *args) { fprintf(bcftools_stderr,"Merging %d temporary files\n", (int)args->nblk); - khp_blk_t *bhp = khp_init(blk); int i; @@ -229,7 +297,9 @@ void merge_blocks(args_t *args) blk_read(args, bhp, args->hdr, blk); } - htsFile *out = hts_open(args->output_fname, 
hts_bcf_wmode2(args->output_type,args->output_fname)); + char wmode[8]; + set_wmode(wmode,args->output_type,args->output_fname,args->clevel); + htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); while ( bhp->ndat ) { @@ -254,13 +324,15 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools sort [OPTIONS] \n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + #ifdef _WIN32 - fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); + fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n"); #else - fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); + fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); #endif fprintf(bcftools_stderr, "\n"); bcftools_exit(1); @@ -280,6 +352,10 @@ size_t parse_mem_string(const char *str) void mkdir_p(const char *fmt, ...); static void init(args_t *args) { + args->max_mem *= 0.9; + 
args->mem_block = malloc(args->max_mem); + args->mem = 0; + args->tmp_dir = init_tmp_prefix(args->tmp_dir); #ifdef _WIN32 @@ -297,6 +373,7 @@ static void init(args_t *args) static void destroy(args_t *args) { bcf_hdr_destroy(args->hdr); + free(args->mem_block); free(args->tmp_dir); free(args); } @@ -308,6 +385,7 @@ int main_sort(int argc, char *argv[]) args->argc = argc; args->argv = argv; args->max_mem = 768*1000*1000; args->output_fname = "-"; + args->clevel = -1; static struct option loptions[] = { @@ -319,6 +397,7 @@ int main_sort(int argc, char *argv[]) {"help",no_argument,NULL,'h'}, {0,0,0,0} }; + char *tmp; while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0) { switch (c) @@ -332,8 +411,17 @@ int main_sort(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'h': case '?': usage(args); break; diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 601c557..c13b3e3 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1759,10 +1759,12 @@ static void usage(void) fprintf(stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -s, --samples 
LIST List of samples for sample stats, \"-\" to include all samples\n"); fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); fprintf(stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); @@ -1779,6 +1781,8 @@ int main_vcfstats(int argc, char *argv[]) args->argc = argc; args->argv = argv; args->dp_min = 0; args->dp_max = 500; args->dp_step = 1; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -1791,6 +1795,7 @@ int main_vcfstats(int argc, char *argv[]) {"collapse",1,0,'c'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"verbose",0,0,'v'}, {"depth",1,0,'d'}, {"apply-filters",1,0,'f'}, @@ -1800,6 +1805,7 @@ int main_vcfstats(int argc, char *argv[]) {"split-by-ID",0,0,'I'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"fasta-ref",1,0,'F'}, {"user-tstv",1,0,'u'}, {"threads",1,0,9}, @@ -1844,6 +1850,18 @@ int main_vcfstats(int argc, char *argv[]) case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap 
= 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); break; @@ -1865,10 +1883,18 @@ int main_vcfstats(int argc, char *argv[]) if ( args->split_by_id ) error("Only one file can be given with -i.\n"); } if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO; - if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", args->targets_list); - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->targets_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); + if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", args->targets_list); + } + if ( args->regions_list) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + } if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) error("Failed to create threads\n"); diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 050a68a..4f6c898 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -1761,10 +1761,12 @@ static void usage(void) fprintf(bcftools_stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n"); fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of 
regions\n"); fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(bcftools_stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n"); fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n"); fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(bcftools_stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); fprintf(bcftools_stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n"); fprintf(bcftools_stderr, " --threads INT Use multithreading with worker threads [0]\n"); @@ -1781,6 +1783,8 @@ int main_vcfstats(int argc, char *argv[]) args->argc = argc; args->argv = argv; args->dp_min = 0; args->dp_max = 500; args->dp_step = 1; int regions_is_file = 0, targets_is_file = 0; + int regions_overlap = 1; + int targets_overlap = 0; static struct option loptions[] = { @@ -1793,6 +1797,7 @@ int main_vcfstats(int argc, char *argv[]) {"collapse",1,0,'c'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"verbose",0,0,'v'}, {"depth",1,0,'d'}, {"apply-filters",1,0,'f'}, @@ -1802,6 +1807,7 @@ int main_vcfstats(int argc, char *argv[]) {"split-by-ID",0,0,'I'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"fasta-ref",1,0,'F'}, {"user-tstv",1,0,'u'}, {"threads",1,0,9}, @@ -1846,6 +1852,18 @@ int main_vcfstats(int argc, char *argv[]) case 'i': if ( 
args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 3 : + if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; + case 4 : + if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); break; @@ -1867,10 +1885,18 @@ int main_vcfstats(int argc, char *argv[]) if ( args->split_by_id ) error("Only one file can be given with -i.\n"); } if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO; - if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", args->targets_list); - if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions_list); + if ( args->targets_list ) + { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); + if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", args->targets_list); + } + if ( args->regions_list) + { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap); + if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) + error("Failed to read the regions: %s\n", args->regions_list); + } if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) error("Failed to create threads\n"); diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c 
index ce4c810..1dbcc61 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -65,6 +65,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list; + int regions_overlap, targets_overlap; int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac; int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased; int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type; @@ -220,12 +221,9 @@ static void init_data(args_t *args) free(type_list); } - // setup output - const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out); - char modew[8]; - strcpy(modew,tmp); - if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); - args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); + char wmode[8]; + set_wmode(wmode,args->output_type,args->fn_out,args->clevel); + args->out = hts_open(args->fn_out ? 
args->fn_out : "-", wmode); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); @@ -495,39 +493,43 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools view [options] [region1 [...]]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Output options:\n"); - fprintf(stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); - fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); - fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output output file name [stdout]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. 
Exclude regions with \"^\" prefix\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n"); + fprintf(stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n"); + fprintf(stderr, " -H, --no-header Suppress the header in VCF output\n"); + fprintf(stderr, " --with-header Print both header and records in VCF output [default]\n"); + fprintf(stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); + fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); + fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); + fprintf(stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. 
Exclude regions with \"^\" prefix\n"); + fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Subset options:\n"); - fprintf(stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); - fprintf(stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " --force-samples only warn about unknown subset samples\n"); + fprintf(stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); + fprintf(stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); + fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(stderr, "\n"); fprintf(stderr, "Filter options:\n"); - fprintf(stderr, " -c/C, --min-ac/--max-ac [:] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(stderr, " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n"); - fprintf(stderr, " -i/e, --include/--exclude select/exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n"); - fprintf(stderr, " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); - fprintf(stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n"); - fprintf(stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); - fprintf(stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); - fprintf(stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n"); + fprintf(stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n"); + fprintf(stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); + fprintf(stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n"); + fprintf(stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); + fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); + fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); fprintf(stderr, "\n"); exit(1); } @@ -545,6 +547,8 @@ int main_vcfview(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->min_ac = args->max_ac = args->min_af = args->max_af = -1; + args->regions_overlap = 1; + args->targets_overlap = 0; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -554,6 +558,7 @@ int main_vcfview(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"header-only",no_argument,NULL,'h'}, {"no-header",no_argument,NULL,'H'}, + {"with-header",no_argument,NULL,4}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, 
{"trim-alt-alleles",no_argument,NULL,'a'}, @@ -578,8 +583,10 @@ int main_vcfview(int argc, char *argv[]) {"exclude-types",required_argument,NULL,'V'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"min-ac",required_argument,NULL,'c'}, {"max-ac",required_argument,NULL,'C'}, {"min-af",required_argument,NULL,'q'}, @@ -601,8 +608,17 @@ int main_vcfview(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'l': args->clevel = strtol(optarg,&tmp,10); @@ -612,6 +628,7 @@ int main_vcfview(int argc, char *argv[]) case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; case 'h': args->header_only = 1; break; + case 4 : args->print_header = 1; args->header_only = 0; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; @@ -698,6 +715,18 @@ int main_vcfview(int argc, char *argv[]) else error("The argument to -g not recognised. 
Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg); break; } + case 2 : + if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 3 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': usage(args); break; @@ -723,6 +752,7 @@ int main_vcfview(int argc, char *argv[]) // read in the regions from the command line if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } @@ -738,6 +768,7 @@ int main_vcfview(int argc, char *argv[]) } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 75b3e64..9767124 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -67,6 +67,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list; + int regions_overlap, targets_overlap; int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac; int trim_alts, sites_only, known, 
novel, min_alleles, max_alleles, private_vars, uncalled, phased; int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type; @@ -222,12 +223,9 @@ static void init_data(args_t *args) free(type_list); } - // setup output - const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out); - char modew[8]; - strcpy(modew,tmp); - if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); - args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); + char wmode[8]; + set_wmode(wmode,args->output_type,args->fn_out,args->clevel); + args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); @@ -497,39 +495,43 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Usage: bcftools view [options] [region1 [...]]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Output options:\n"); - fprintf(bcftools_stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); - fprintf(bcftools_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); - fprintf(bcftools_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -t, --targets [^] similar to -r but streams rather than 
index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(bcftools_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(bcftools_stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n"); + fprintf(bcftools_stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n"); + fprintf(bcftools_stderr, " -H, --no-header Suppress the header in VCF output\n"); + fprintf(bcftools_stderr, " --with-header Print both header and records in VCF output [default]\n"); + fprintf(bcftools_stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); + fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); + fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); + fprintf(bcftools_stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); + fprintf(bcftools_stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. 
Exclude regions with \"^\" prefix\n"); + fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); + fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Subset options:\n"); - fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); - fprintf(bcftools_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " --force-samples only warn about unknown subset samples\n"); + fprintf(bcftools_stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); + fprintf(bcftools_stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); + fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Filter options:\n"); - fprintf(bcftools_stderr, " -c/C, --min-ac/--max-ac [:] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings 
(e.g. \"PASS,.\")\n"); - fprintf(bcftools_stderr, " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n"); - fprintf(bcftools_stderr, " -i/e, --include/--exclude select/exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n"); - fprintf(bcftools_stderr, " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); - fprintf(bcftools_stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n"); - fprintf(bcftools_stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); - fprintf(bcftools_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); - fprintf(bcftools_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(bcftools_stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(bcftools_stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n"); + fprintf(bcftools_stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n"); + fprintf(bcftools_stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); + fprintf(bcftools_stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n"); + fprintf(bcftools_stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); + fprintf(bcftools_stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); + fprintf(bcftools_stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -547,6 +549,8 @@ int main_vcfview(int argc, char *argv[]) args->n_threads = 0; args->record_cmd_line = 1; args->min_ac = args->max_ac = args->min_af = args->max_af = -1; + args->regions_overlap = 1; + args->targets_overlap = 0; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = @@ -556,6 +560,7 @@ int main_vcfview(int argc, char *argv[]) {"threads",required_argument,NULL,9}, {"header-only",no_argument,NULL,'h'}, {"no-header",no_argument,NULL,'H'}, + 
{"with-header",no_argument,NULL,4}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"trim-alt-alleles",no_argument,NULL,'a'}, @@ -580,8 +585,10 @@ int main_vcfview(int argc, char *argv[]) {"exclude-types",required_argument,NULL,'V'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, + {"targets-overlap",required_argument,NULL,2}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, + {"regions-overlap",required_argument,NULL,3}, {"min-ac",required_argument,NULL,'c'}, {"max-ac",required_argument,NULL,'C'}, {"min-af",required_argument,NULL,'q'}, @@ -603,8 +610,17 @@ int main_vcfview(int argc, char *argv[]) case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; - default: error("The output type \"%s\" not recognised\n", optarg); + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } }; + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } break; case 'l': args->clevel = strtol(optarg,&tmp,10); @@ -614,6 +630,7 @@ int main_vcfview(int argc, char *argv[]) case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; case 'h': args->header_only = 1; break; + case 4 : args->print_header = 1; args->header_only = 0; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; @@ -700,6 +717,18 @@ int main_vcfview(int argc, char *argv[]) else error("The argument to -g not recognised. 
Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg); break; } + case 2 : + if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; + else error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 3 : + if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; + else error("Could not parse: --regions-overlap %s\n",optarg); + break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case '?': usage(args); break; @@ -725,6 +754,7 @@ int main_vcfview(int argc, char *argv[]) // read in the regions from the command line if ( args->regions_list ) { + bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap); if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } @@ -740,6 +770,7 @@ int main_vcfview(int argc, char *argv[]) } if ( args->targets_list ) { + bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap); if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } diff --git a/bcftools/version.c b/bcftools/version.c index d068897..73e0b04 100644 --- a/bcftools/version.c +++ b/bcftools/version.c @@ -80,3 +80,23 @@ const char *hts_bcf_wmode2(int file_type, char *fname) return hts_bcf_wmode(file_type); } +void set_wmode(char dst[8], int file_type, char *fname, int clevel) +{ + const char *ret = NULL; + int len = fname ? 
strlen(fname) : 0; + if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF); + else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else ret = hts_bcf_wmode(file_type); + if ( clevel>=0 && clevel<=9 ) + { + if ( strchr(ret,'v') || strchr(ret,'u') ) error("Error: compression level (%d) cannot be set on uncompressed streams (%s)\n",clevel,fname); + len = strlen(ret); + if ( len>6 ) error("Fixme: %s\n", ret); + sprintf(dst, "%s%d", ret, clevel); + } + else + strcpy(dst, ret); +} + diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index 37fa828..f524b21 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -82,3 +82,23 @@ const char *hts_bcf_wmode2(int file_type, char *fname) return hts_bcf_wmode(file_type); } +void set_wmode(char dst[8], int file_type, char *fname, int clevel) +{ + const char *ret = NULL; + int len = fname ? 
strlen(fname) : 0; + if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF); + else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else ret = hts_bcf_wmode(file_type); + if ( clevel>=0 && clevel<=9 ) + { + if ( strchr(ret,'v') || strchr(ret,'u') ) error("Error: compression level (%d) cannot be set on uncompressed streams (%s)\n",clevel,fname); + len = strlen(ret); + if ( len>6 ) error("Fixme: %s\n", ret); + sprintf(dst, "%s%d", ret, clevel); + } + else + strcpy(dst, ret); +} + diff --git a/bcftools/version.sh b/bcftools/version.sh index 52b1e08..1bcfcea 100755 --- a/bcftools/version.sh +++ b/bcftools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.13 +VERSION=1.14 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/devtools/import.py b/devtools/import.py index ea35792..ffbd88f 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -96,9 +96,10 @@ def _update_pysam_files(cf, destdir): if basename == "samtools": lines = re.sub(r"main_(reheader)\(", r"samtools_main_\1(", lines) + lines = re.sub(r"\b({}_stdout)\b".format(basename), r"\1_internal", lines) lines = re.sub(r"\bexit\(", "{}_exit(".format(basename), lines) - lines = re.sub("stderr", "{}_stderr".format(basename), lines) - lines = re.sub("stdout", "{}_stdout".format(basename), lines) + lines = re.sub(r"\bstderr\b", "{}_stderr".format(basename), lines) + lines = re.sub(r"\bstdout\b", "{}_stdout".format(basename), lines) lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines) lines = re.sub(r"([^kf])puts\(", r"\1{}_puts(".format(basename), lines) lines = re.sub(r"putchar\(([^)]+)\)", diff --git a/doc/index.rst 
b/doc/index.rst index 15de2ca..bfdc602 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.13*, *samtools-1.13*, and *bcftools-1.13*. +The current version wraps *htslib-1.14*, *samtools-1.14*, and *bcftools-1.14*. To install the latest release, type:: diff --git a/doc/release.rst b/doc/release.rst index 966ee6a..a2f31d4 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,16 @@ Release notes ============= +Release 0.18.0 +============== + +This release wraps htslib/samtools/bcftools version 1.14. + +* [#1048] and [#1060], clarify documentation of index statistics with CRAM files +* Prevent "retval may be used uninitialised" warning. +* Add new "samples" subcommand to pysam/samtools.py +* Introduce TupleProxyIterator iterator object class + Release 0.17.0 ============== diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index e192ff3..d4bdfba 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1760,6 +1760,8 @@ cdef class AlignmentFile(HTSFile): """int with total number of mapped alignments according to the statistics recorded in the index. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) """ def __get__(self): self.check_index() @@ -1776,6 +1778,8 @@ cdef class AlignmentFile(HTSFile): """int with total number of unmapped reads according to the statistics recorded in the index. This number of reads includes the number of reads without coordinates. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) 
""" def __get__(self): self.check_index() @@ -1792,6 +1796,8 @@ cdef class AlignmentFile(HTSFile): """int with total number of reads without coordinates according to the statistics recorded in the index, i.e., the statistic printed for "*" by the ``samtools idxstats`` command. This is a read-only attribute. + (This will be 0 for a CRAM file indexed by a .crai index, as that + index format does not record these statistics.) """ def __get__(self): self.check_index() @@ -1805,6 +1811,9 @@ cdef class AlignmentFile(HTSFile): they are stored in the index, similarly to the statistics printed by the ``samtools idxstats`` command. + CRAI indexes do not record these statistics, so for a CRAM file + with a .crai index the returned statistics will all be 0. + Returns: list : a list of records for each chromosome. Each record has the diff --git a/pysam/libctabixproxies.pxd b/pysam/libctabixproxies.pxd index edea701..907b40d 100644 --- a/pysam/libctabixproxies.pxd +++ b/pysam/libctabixproxies.pxd @@ -9,7 +9,6 @@ cdef class TupleProxy: char * data char ** fields int nfields - int index int nbytes int offset bint is_modified @@ -26,6 +25,11 @@ cdef class TupleProxy: cdef update(self, char * buffer, size_t nbytes) +cdef class TupleProxyIterator: + cdef TupleProxy proxy + cdef int index + + cdef class NamedTupleProxy(TupleProxy): pass diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx index f95425a..10b3e5a 100644 --- a/pysam/libctabixproxies.pyx +++ b/pysam/libctabixproxies.pyx @@ -42,7 +42,6 @@ cdef class TupleProxy: def __cinit__(self, encoding="ascii"): self.data = NULL self.fields = NULL - self.index = 0 self.nbytes = 0 self.is_modified = 0 self.nfields = 0 @@ -301,20 +300,7 @@ cdef class TupleProxy: return self.nfields def __iter__(self): - self.index = 0 - return self - - def __next__(self): - """python version of next(). 
- """ - if self.index >= self.nfields: - raise StopIteration - cdef char * retval = self.fields[self.index] - self.index += 1 - if retval == NULL: - return None - else: - return force_str(retval, self.encoding) + return TupleProxyIterator(self) def __str__(self): '''return original data''' @@ -339,6 +325,23 @@ cdef class TupleProxy: r = result.decode(self.encoding) return r + +cdef class TupleProxyIterator: + def __init__(self, proxy): + self.proxy = proxy + self.index = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.index >= self.proxy.nfields: + raise StopIteration + cdef char *retval = self.proxy.fields[self.index] + self.index += 1 + return force_str(retval, self.proxy.encoding) if retval != NULL else None + + def toDot(v): '''convert value to '.' if None''' if v is None: diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index adc9cec..d936dc6 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -425,6 +425,9 @@ def _pysam_dispatch(collection, retval = bcftools_dispatch(n + 2, cargs) bcftools_close_stdout() bcftools_close_stderr() + else: + # unknown -- just return a Unix shell's "command not found" exit status + retval = 127 for i from 0 <= i < n: free(cargs[i + 2]) diff --git a/pysam/samtools.py b/pysam/samtools.py index 9042cc1..30d3edf 100644 --- a/pysam/samtools.py +++ b/pysam/samtools.py @@ -41,6 +41,7 @@ SAMTOOLS_DISPATCH = { "ampliconstats": ("ampliconstats", None), "version": ("version", None), "fqimport": ("import", None), + "samples": ("samples", None), } # instantiate samtools commands as python functions diff --git a/pysam/version.h b/pysam/version.h index 33676ea..4794a2f 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.13 (pysam)" -#define BCFTOOLS_VERSION "1.13 (pysam)" -#define HTS_VERSION_TEXT "1.13 (pysam)" +#define SAMTOOLS_VERSION "1.14 (pysam)" +#define BCFTOOLS_VERSION "1.14 
(pysam)" +#define HTS_VERSION_TEXT "1.14 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index 8c871ba..97f673a 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information -__version__ = "0.17.0" +__version__ = "0.18.0" -__samtools_version__ = "1.13" -__bcftools_version__ = "1.13" -__htslib_version__ = "1.13" +__samtools_version__ = "1.14" +__bcftools_version__ = "1.14" +__htslib_version__ = "1.14" diff --git a/samtools/README b/samtools/README index dd27670..9aceb77 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.13 # Within the unpacked release directory + cd .../samtools-1.14 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.13 # Within the unpacked release directory + cd .../samtools-1.14 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.13 # Within the unpacked release directory + cd .../samtools-1.14 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. 
In that case you can use: - cd .../samtools-1.13 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.13 + cd .../samtools-1.14 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.14 make all all-htslib It is possible to override the built-in search path using the HTS_PATH diff --git a/samtools/bam.c b/samtools/bam.c index 926062c..5a77d66 100644 --- a/samtools/bam.c +++ b/samtools/bam.c @@ -1,4 +1,4 @@ -/* bam.c -- BAM format. +/* bam.c -- miscellaneous BAM functions. Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. @@ -31,50 +31,8 @@ DEALINGS IN THE SOFTWARE. */ #include "bam.h" #include "htslib/kstring.h" -char *bam_format1(const bam_header_t *header, const bam1_t *b) -{ - kstring_t str; - str.l = str.m = 0; str.s = NULL; - if (sam_format1(header, b, &str) < 0) { - free(str.s); - str.s = NULL; - return NULL; - } - return str.s; -} - -int bam_view1(const bam_header_t *header, const bam1_t *b) -{ - char *s = bam_format1(header, b); - int ret = -1; - if (!s) return -1; - if (puts(s) != EOF) ret = 0; - free(s); - return ret; -} - -int bam_validate1(const bam_header_t *header, const bam1_t *b) -{ - char *s; - - if (b->core.tid < -1 || b->core.mtid < -1) return 0; - if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0; - - if (b->data_len < b->core.l_qname) return 0; - s = memchr(bam1_qname(b), '\0', b->core.l_qname); - if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0; - - // FIXME: Other fields could also be checked, especially the auxiliary data - - return 1; -} - -#ifndef MIN -#define MIN(a,b) ((a)<(b)?(a):(b)) -#endif - // FIXME: we should also check the LB tag associated with each alignment -const char *bam_get_library(bam_header_t *h, const bam1_t *b) +const char *bam_get_library(sam_hdr_t *h, const bam1_t *b) { const char *rg; kstring_t lib = { 
0, 0, NULL }; @@ -99,19 +57,6 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b) return LB_text; } -int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) -{ - int ret; - bam_iter_t iter; - bam1_t *b; - b = bam_init1(); - iter = bam_iter_query(idx, tid, beg, end); - while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data); - bam_iter_destroy(iter); - bam_destroy1(b); - return (ret == -1)? 0 : ret; -} - /************ * Remove B * ************/ diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c index 2f40ca6..1fdd279 100644 --- a/samtools/bam.c.pysam.c +++ b/samtools/bam.c.pysam.c @@ -1,6 +1,6 @@ #include "samtools.pysam.h" -/* bam.c -- BAM format. +/* bam.c -- miscellaneous BAM functions. Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. @@ -33,50 +33,8 @@ DEALINGS IN THE SOFTWARE. */ #include "bam.h" #include "htslib/kstring.h" -char *bam_format1(const bam_header_t *header, const bam1_t *b) -{ - kstring_t str; - str.l = str.m = 0; str.s = NULL; - if (sam_format1(header, b, &str) < 0) { - free(str.s); - str.s = NULL; - return NULL; - } - return str.s; -} - -int bam_view1(const bam_header_t *header, const bam1_t *b) -{ - char *s = bam_format1(header, b); - int ret = -1; - if (!s) return -1; - if (samtools_puts(s) != EOF) ret = 0; - free(s); - return ret; -} - -int bam_validate1(const bam_header_t *header, const bam1_t *b) -{ - char *s; - - if (b->core.tid < -1 || b->core.mtid < -1) return 0; - if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0; - - if (b->data_len < b->core.l_qname) return 0; - s = memchr(bam1_qname(b), '\0', b->core.l_qname); - if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0; - - // FIXME: Other fields could also be checked, especially the auxiliary data - - return 1; -} - -#ifndef MIN -#define MIN(a,b) ((a)<(b)?(a):(b)) -#endif - // FIXME: we should 
also check the LB tag associated with each alignment -const char *bam_get_library(bam_header_t *h, const bam1_t *b) +const char *bam_get_library(sam_hdr_t *h, const bam1_t *b) { const char *rg; kstring_t lib = { 0, 0, NULL }; @@ -101,19 +59,6 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b) return LB_text; } -int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) -{ - int ret; - bam_iter_t iter; - bam1_t *b; - b = bam_init1(); - iter = bam_iter_query(idx, tid, beg, end); - while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data); - bam_iter_destroy(iter); - bam_destroy1(b); - return (ret == -1)? 0 : ret; -} - /************ * Remove B * ************/ diff --git a/samtools/bam.h b/samtools/bam.h index 804d590..6e1c0d5 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -1,7 +1,6 @@ -/* bam.h -- BAM API. +/* bam.h -- miscellaneous BAM functions. Copyright (C) 2008-2014, 2019 Genome Research Ltd. - Portions copyright (C) 2010-2012 Broad Institute. Author: Heng Li @@ -26,545 +25,10 @@ DEALINGS IN THE SOFTWARE. */ #ifndef BAM_BAM_H #define BAM_BAM_H -/*! - @header - - BAM library provides I/O and various operations on manipulating files - in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to SAM, sorting, - merging, generating pileup, and quickly retrieval of reads overlapped - with a specified region. - - @copyright Genome Research Ltd. - */ - -#define BAM_VERSION "1.13" - -#include -#include -#include -#include - -#include "htslib/bgzf.h" #include "htslib/sam.h" -/*! @abstract BAM file handler */ -typedef BGZF *bamFile; -#define bam_open(fn, mode) bgzf_open(fn, mode) -#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) -#define bam_close(fp) bgzf_close(fp) -#define bam_tell(fp) bgzf_tell(fp) -#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) - -/*! @typedef - @abstract Structure for the alignment header. 
- @field n_targets number of reference sequences - @field target_name names of the reference sequences - @field target_len lengths of the referene sequences - @field dict header dictionary - @field hash hash table for fast name lookup - @field rg2lib hash table for @RG-ID -> LB lookup - @field l_text length of the plain text in the header - @field text plain text - - @discussion Field hash points to null by default. It is a private - member. - */ -typedef bam_hdr_t bam_header_t; - -// TODO This flag-formatting functionality does not currently exist in htslib -#define BAM_OFDEC 0 -#define BAM_OFHEX 1 -#define BAM_OFSTR 2 - -/*! @abstract default mask for pileup */ -#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) - -/*! @typedef - @abstract Structure for core alignment information. - @field tid chromosome ID, defined by bam_header_t - @field pos 0-based leftmost coordinate - @field bin bin calculated by bam_reg2bin() - @field qual mapping quality - @field l_qname length of the query name - @field flag bitwise flag - @field n_cigar number of CIGAR operations - @field l_qseq length of the query sequence (read) - */ -// typedef struct { ... } bam1_core_t; - -/*! @typedef - @abstract Structure for one alignment. - @field core core information about the alignment - @field l_aux length of auxiliary data - @field data_len current length of bam1_t::data - @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux - - @discussion Notes: - - 1. qname is zero tailing and core.l_qname includes the tailing '\0'. - 2. l_qseq is calculated from the total length of an alignment block - on reading or from CIGAR. - 3. cigar data is encoded 4 bytes per CIGAR operation. - 4. seq is nybble-encoded according to bam_nt16_table. - */ -// typedef struct { ... 
} bam1_t; -// NOTE htslib version doesn't have l_aux; use bam_get_l_aux(b) instead -#ifndef SAMTOOLS_HTSLIB_SUPPRESS_HACKS -// NOTE htslib also renames data_len to l_data; this macro may help or hinder -#define data_len l_data -#endif - -typedef hts_itr_t *bam_iter_t; - -#define bam1_strand(b) (bam_is_rev((b))) -#define bam1_mstrand(b) (bam_is_mrev((b))) - -/*! @function - @abstract Get the CIGAR array - @param b pointer to an alignment - @return pointer to the CIGAR array - - @discussion In the CIGAR array, each element is a 32-bit integer. The - lower 4 bits gives a CIGAR operation and the higher 28 bits keep the - length of a CIGAR. - */ -#define bam1_cigar(b) (bam_get_cigar((b))) - -/*! @function - @abstract Get the name of the query - @param b pointer to an alignment - @return pointer to the name string, null terminated - */ -#define bam1_qname(b) (bam_get_qname((b))) - -/*! @function - @abstract Get query sequence - @param b pointer to an alignment - @return pointer to sequence - - @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, - 8 for T and 15 for N. Two bases are packed in one byte with the base - at the higher 4 bits having smaller coordinate on the read. It is - recommended to use bam1_seqi() macro to get the base. - */ -#define bam1_seq(b) (bam_get_seq((b))) - -/*! @function - @abstract Get query quality - @param b pointer to an alignment - @return pointer to quality string - */ -#define bam1_qual(b) (bam_get_qual((b))) - -/*! @function - @abstract Get a base on read - @param s Query sequence returned by bam1_seq() - @param i The i-th position, 0-based - @return 4-bit integer representing the base. - */ -#define bam1_seqi(s, i) (bam_seqi((s), (i))) - -/*! @function - @abstract Get auxiliary data - @param b pointer to an alignment - @return pointer to the concatenated auxiliary data - */ -#define bam1_aux(b) (bam_get_aux((b))) - -/*! 
- @abstract Verbose level between 0 and 3; 0 is supposed to disable all - debugging information, though this may not have been implemented. - */ -#define bam_verbose hts_verbose - -/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ -#define bam_nt16_table seq_nt16_table - -/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ -#define bam_nt16_rev_table seq_nt16_str - -/*! @abstract Table for converting a 4-bit encoded nucleotide to ~2 bits. */ -#define bam_nt16_nt4_table seq_nt16_int - -#ifdef __cplusplus -extern "C" { -#endif - - /********************* - * Low-level SAM I/O * - *********************/ - - /*! @abstract TAM file handler */ - typedef samFile *tamFile; - - /*! - @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. - @param fn SAM file name - @return SAM file handler - */ - static inline tamFile samtools_sam_open(const char *fn) { return sam_open(fn, "r"); } - #undef sam_open - #define sam_open samtools_sam_open - - /*! - @abstract Close a SAM file handler - @param fp SAM file handler - */ - // void sam_close(tamFile fp); - - /*! - @abstract Read one alignment from a SAM file handler - @param fp SAM file handler - @param header header information (ordered names of chromosomes) - @param b read alignment; all members in b will be updated - @return 0 if successful; otherwise negative - */ - // int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); - - /*! 
- @abstract Read header from a SAM file (if present) - @param fp SAM file handler - @return pointer to header struct; 0 if no @SQ lines available - */ - static inline bam_header_t *sam_header_read(tamFile fp) { return sam_hdr_read(fp); } - - // Note the distressing cast -- bam_name2id is not thread-safe - static inline int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) { return bam_name2id((bam_header_t *)header, seq_name); } - - - /********************* - * Low-level BAM I/O * - *********************/ - - /*! - @abstract Initialize a header structure. - @return the pointer to the header structure - */ - static inline bam_header_t *bam_header_init(void) { return sam_hdr_init(); } - - /*! - @abstract Destroy a header structure. - @param header pointer to the header - */ - static inline void bam_header_destroy(bam_header_t *header) { sam_hdr_destroy(header); } - - /*! - @abstract Read a header structure from BAM. - @param fp BAM file handler, opened by bam_open() - @return pointer to the header structure - - @discussion The file position indicator must be placed at the - beginning of the file. Upon success, the position indicator will - be set at the start of the first alignment. - */ - static inline bam_header_t *bam_header_read(bamFile fp) { return bam_hdr_read(fp); } - - /*! - @abstract Write a header structure to BAM. - @param fp BAM file handler - @param header pointer to the header structure - @return always 0 currently - */ - static inline int bam_header_write(bamFile fp, bam_header_t *header) { return bam_hdr_write(fp, header); } - - /*! - @abstract Read an alignment from BAM. - @param fp BAM file handler - @param b read alignment; all members are updated. - @return number of bytes read from the file - - @discussion The file position indicator must be - placed right before an alignment. Upon success, this function - will set the position indicator to the start of the next - alignment. 
This function is not affected by the machine - endianness. - */ - // int bam_read1(bamFile fp, bam1_t *b); - - int bam_remove_B(bam1_t *b); - - /*! - @abstract Write an alignment to BAM. - @param fp BAM file handler - @param b alignment to write - @return number of bytes written to the file - */ - // int bam_write1(bamFile fp, const bam1_t *b); - - /*! @function - @abstract Initiate a pointer to bam1_t struct - */ -//#define bam_init1() - - /*! @function - @abstract Free the memory allocated for an alignment. - @param b pointer to an alignment - */ -//#define bam_destroy1(b) - - /*! - @abstract Format a BAM record in the SAM format - @param header pointer to the header structure - @param b alignment to print - @return a pointer to the SAM string - */ - char *bam_format1(const bam_header_t *header, const bam1_t *b); - - /*! - @abstract Formats a BAM record and writes it and \n to stdout - @return 0 if successful, -1 on error - */ - int bam_view1(const bam_header_t *header, const bam1_t *b); - - /*! - @abstract Check whether a BAM record is plausibly valid - @param header associated header structure, or NULL if unavailable - @param b alignment to validate - @return 0 if the alignment is invalid; non-zero otherwise - - @discussion Simple consistency check of some of the fields of the - alignment record. If the header is provided, several additional checks - are made. Not all fields are checked, so a non-zero result is not a - guarantee that the record is valid. However it is usually good enough - to detect when bam_seek() has been called with a virtual file offset - that is not the offset of an alignment record. - */ - int bam_validate1(const bam_header_t *header, const bam1_t *b); - - // TODO Parses headers, so not yet implemented in terms of htslib - const char *bam_get_library(bam_header_t *header, const bam1_t *b); - - - /*************** - * pileup APIs * - ***************/ - - /*! @typedef - @abstract Structure for one alignment covering the pileup position. 
- @field b pointer to the alignment - @field qpos position of the read base at the pileup site, 0-based - @field indel indel length; 0 for no indel, positive for ins and negative for del - @field is_del 1 iff the base on the padded read is a deletion - @field level the level of the read in the "viewer" mode - - @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The - difference between the two functions is that the former does not - set bam_pileup1_t::level, while the later does. Level helps the - implementation of alignment viewers, but calculating this has some - overhead. - */ - // typedef struct { ... } bam_pileup1_t; - - // typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); - - // typedef struct incomplete *bam_plp_t; - - // bam_plp_t bam_plp_init(bam_plp_auto_f read, void *data); - // int bam_plp_push(bam_plp_t iter, const bam1_t *b); - // const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); - // const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); - // void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); - // void bam_plp_reset(bam_plp_t iter); - // void bam_plp_destroy(bam_plp_t iter); - - // typedef struct incomplete *bam_mplp_t; - - // bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); - // void bam_mplp_destroy(bam_mplp_t iter); - // void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); - // int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); - - /*! @typedef - @abstract Type of function to be called by bam_plbuf_push(). - @param tid chromosome ID as is defined in the header - @param pos start coordinate of the alignment, 0-based - @param n number of elements in pl array - @param pl array of alignments - @param data user provided data - @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. 
- */ - typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); - - typedef struct { - bam_plp_t iter; - bam_pileup_f func; - void *data; - } bam_plbuf_t; - - void bam_plbuf_reset(bam_plbuf_t *buf); - bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); - void bam_plbuf_destroy(bam_plbuf_t *buf); - int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); - - int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); - - struct __bam_lplbuf_t; - typedef struct __bam_lplbuf_t bam_lplbuf_t; - - void bam_lplbuf_reset(bam_lplbuf_t *buf); - - /*! @abstract bam_plbuf_init() equivalent with level calculated. */ - bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); - - /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ - void bam_lplbuf_destroy(bam_lplbuf_t *tv); - - /*! @abstract bam_plbuf_push() equivalent with level calculated. */ - int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); - - - /********************* - * BAM indexing APIs * - *********************/ - - typedef hts_idx_t bam_index_t; - - /*! - @abstract Build index for a BAM file. - @discussion Index file "fn.bai" will be created. - @param fn name of the BAM file - @return always 0 currently - */ - static inline int samtools_bam_index_build(const char *fn) { return bam_index_build(fn, 0); } - #undef bam_index_build - #define bam_index_build samtools_bam_index_build - - /*! - @abstract Load index from file "fn.bai". - @param fn name of the BAM file (NOT the index file) - @return pointer to the index structure - */ - // bam_index_t *bam_index_load(const char *fn); - - /*! - @abstract Destroy an index structure. - @param idx pointer to the index structure - */ - static inline void bam_index_destroy(bam_index_t *idx) { hts_idx_destroy(idx); } - - /*! @typedef - @abstract Type of function to be called by bam_fetch(). 
- @param b the alignment - @param data user provided data - */ - typedef int (*bam_fetch_f)(const bam1_t *b, void *data); - - /*! - @abstract Retrieve the alignments that are overlapped with the - specified region. (For BAM files only; see also samfetch() in sam.h.) - - @discussion A user defined function will be called for each - retrieved alignment ordered by its start position. - - @param fp BAM file handler - @param idx pointer to the alignment index - @param tid chromosome ID as is defined in the header - @param beg start coordinate, 0-based - @param end end coordinate, 0-based - @param data user provided data (will be transferred to func) - @param func user defined function - */ - int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); - - static inline bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) { return bam_itr_queryi(idx, tid, beg, end); } - static inline int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) { return iter? hts_itr_next(fp, iter, b, 0) : bam_read1(fp, b); } - static inline void bam_iter_destroy(bam_iter_t iter) { bam_itr_destroy(iter); } - - /*! - @abstract Parse a region in the format: "chr2:100,000-200,000". - @discussion bam_header_t::hash will be initialized if empty. - @param header pointer to the header structure - @param str string to be parsed - @param ref_id the returned chromosome ID - @param begin the returned start coordinate - @param end the returned end coordinate - @return 0 on success; -1 on failure - */ - int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); - - - /************************** - * APIs for optional tags * - **************************/ - - /*! - @abstract Retrieve data of a tag - @param b pointer to an alignment struct - @param tag two-character tag to be retrieved - - @return pointer to the type and data. The first character is the - type that can be 'iIsScCdfAZH'. 
- - @discussion Use bam_aux2?() series to convert the returned data to - the corresponding type. - */ - // uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); - - // int32_t bam_aux2i(const uint8_t *s); - // float bam_aux2f(const uint8_t *s); - #define bam_aux2d(s) (bam_aux2f((s))) - // char bam_aux2A(const uint8_t *s); - // char *bam_aux2Z(const uint8_t *s); - - // int bam_aux_del(bam1_t *b, uint8_t *s); - // void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); - static inline uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) { return bam_aux_get(b, tag); } // an alias of bam_aux_get() - - - /***************** - * Miscellaneous * - *****************/ - - /*! - @abstract Calculate the rightmost coordinate of an alignment on the - reference genome. - - @param c pointer to the bam1_core_t structure - @param cigar the corresponding CIGAR array (from bam1_t::cigar) - @return the rightmost coordinate, 0-based - */ - static inline uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) { return c->pos + (c->n_cigar? bam_cigar2rlen(c->n_cigar, cigar) : 1); } - - /*! - @abstract Calculate the length of the query sequence from CIGAR. - @param c pointer to the bam1_core_t structure - @param cigar the corresponding CIGAR array (from bam1_t::cigar) - @return length of the query sequence - */ - static inline int32_t samtools_bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) { return bam_cigar2qlen(c->n_cigar, cigar); } - #undef bam_cigar2qlen - #define bam_cigar2qlen samtools_bam_cigar2qlen - -#ifdef __cplusplus -} -#endif - -/*! - @abstract Calculate the minimum bin that contains a region [beg,end). - @param beg start of the region, 0-based - @param end end of the region, 0-based - @return bin - */ -static inline int bam_reg2bin(uint32_t beg, uint32_t end) -{ - return hts_reg2bin(beg, end, 14, 5); -} - -/*! 
- @abstract Copy an alignment - @param bdst destination alignment struct - @param bsrc source alignment struct - @return pointer to the destination alignment struct - */ -// bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +int bam_remove_B(bam1_t *b); -/*! - @abstract Duplicate an alignment - @param src source alignment struct - @return pointer to the destination alignment struct - */ -// bam1_t *bam_dup1(const bam1_t *src) +const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); #endif diff --git a/samtools/bam2bcf.c b/samtools/bam2bcf.c index ef3a63e..7cae49d 100644 --- a/samtools/bam2bcf.c +++ b/samtools/bam2bcf.c @@ -1,7 +1,7 @@ /* bam2bcf.c -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2015 Genome Research Ltd. + Copyright (C) 2012-2015, 2021 Genome Research Ltd. Author: Heng Li @@ -371,17 +371,12 @@ double mann_whitney_1947_cdf(int n, int m, int U) double calc_mwu_bias_cdf(int *a, int *b, int n) { int na = 0, nb = 0, i; - double U = 0, ties = 0; + double U = 0; for (i=0; i=8 || nb>=8 ) { double mean = ((double)na*nb)*0.5; - // Correction for ties: - // double N = na+nb; - // double var2 = (N*N-1)*N-ties; - // if ( var2==0 ) return 1.0; - // var2 *= ((double)na*nb)/N/(N-1)/12.0; - // No correction for ties: double var2 = ((double)na*nb)*(na+nb+1)/12.0; double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1) return 2.0 - kf_erfc(z); // which is 1 + erf(z) @@ -415,17 +404,12 @@ double calc_mwu_bias_cdf(int *a, int *b, int n) double calc_mwu_bias(int *a, int *b, int n) { int na = 0, nb = 0, i; - double U = 0, ties = 0; + double U = 0; for (i=0; imean ? 
(2.0*mean-U)/mean : U/mean; } - // Correction for ties: - // double N = na+nb; - // double var2 = (N*N-1)*N-ties; - // if ( var2==0 ) return 1.0; - // var2 *= ((double)na*nb)/N/(N-1)/12.0; - // No correction for ties: double var2 = ((double)na*nb)*(na+nb+1)/12.0; if ( na>=8 || nb>=8 ) { diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c index 485f42f..70b8bee 100644 --- a/samtools/bam2bcf.c.pysam.c +++ b/samtools/bam2bcf.c.pysam.c @@ -3,7 +3,7 @@ /* bam2bcf.c -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2015 Genome Research Ltd. + Copyright (C) 2012-2015, 2021 Genome Research Ltd. Author: Heng Li @@ -373,17 +373,12 @@ double mann_whitney_1947_cdf(int n, int m, int U) double calc_mwu_bias_cdf(int *a, int *b, int n) { int na = 0, nb = 0, i; - double U = 0, ties = 0; + double U = 0; for (i=0; i=8 || nb>=8 ) { double mean = ((double)na*nb)*0.5; - // Correction for ties: - // double N = na+nb; - // double var2 = (N*N-1)*N-ties; - // if ( var2==0 ) return 1.0; - // var2 *= ((double)na*nb)/N/(N-1)/12.0; - // No correction for ties: double var2 = ((double)na*nb)*(na+nb+1)/12.0; double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1) return 2.0 - kf_erfc(z); // which is 1 + erf(z) @@ -417,17 +406,12 @@ double calc_mwu_bias_cdf(int *a, int *b, int n) double calc_mwu_bias(int *a, int *b, int n) { int na = 0, nb = 0, i; - double U = 0, ties = 0; + double U = 0; for (i=0; imean ? 
(2.0*mean-U)/mean : U/mean; } - // Correction for ties: - // double N = na+nb; - // double var2 = (N*N-1)*N-ties; - // if ( var2==0 ) return 1.0; - // var2 *= ((double)na*nb)/N/(N-1)/12.0; - // No correction for ties: double var2 = ((double)na*nb)*(na+nb+1)/12.0; if ( na>=8 || nb>=8 ) { diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index 5253dfa..ac34316 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -733,6 +733,7 @@ int main_depth(int argc, char *argv[]) sam_hdr_t **header; int c, has_index_file = 0; char *file_list = NULL, **fn = NULL; + char *out_file = NULL; depth_opt opt = { .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, .min_qual = 0, @@ -807,7 +808,7 @@ int main_depth(int argc, char *argv[]) case 'o': if (opt.out != stdout) break; - opt.out = fopen(optarg, "w"); + opt.out = fopen(out_file = optarg, "w"); if (!opt.out) { print_error_errno("depth", "Cannot open \"%s\" for writing.", optarg); @@ -948,7 +949,13 @@ int main_depth(int argc, char *argv[]) if (opt.bed) bed_destroy(opt.bed); sam_global_args_free(&ga); - if (opt.out != stdout) fclose(opt.out); + if (opt.out != stdout) { + if (fclose(opt.out) != 0 && ret == 0) { + print_error_errno("depth", "error on closing \"%s\"", out_file); + ret = 1; + } + } + return ret; } diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index 8b36457..7375ef7 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -735,6 +735,7 @@ int main_depth(int argc, char *argv[]) sam_hdr_t **header; int c, has_index_file = 0; char *file_list = NULL, **fn = NULL; + char *out_file = NULL; depth_opt opt = { .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, .min_qual = 0, @@ -809,7 +810,7 @@ int main_depth(int argc, char *argv[]) case 'o': if (opt.out != samtools_stdout) break; - opt.out = fopen(optarg, "w"); + opt.out = fopen(out_file = optarg, "w"); if (!opt.out) { print_error_errno("depth", "Cannot open \"%s\" for writing.", optarg); @@ 
-950,7 +951,13 @@ int main_depth(int argc, char *argv[]) if (opt.bed) bed_destroy(opt.bed); sam_global_args_free(&ga); - if (opt.out != samtools_stdout) fclose(opt.out); + if (opt.out != samtools_stdout) { + if (fclose(opt.out) != 0 && ret == 0) { + print_error_errno("depth", "error on closing \"%s\"", out_file); + ret = 1; + } + } + return ret; } diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c index 77d94f8..d11ee6e 100644 --- a/samtools/bam_aux.c +++ b/samtools/bam_aux.c @@ -26,8 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include -#include "bam.h" +#include "htslib/sam.h" static inline int bam_aux_type2size(int x) { @@ -60,16 +59,3 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s) } return 0; } - -// Only here due to libbam.a being used by some applications. -int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) -{ - hts_pos_t beg64, end64; - int r; - r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 0 : -1; - if (beg64 > INT_MAX || end64 > INT_MAX) - return -1; - *beg = beg64; - *end = end64; - return r; -} diff --git a/samtools/bam_aux.c.pysam.c b/samtools/bam_aux.c.pysam.c index 39fe5ce..621589d 100644 --- a/samtools/bam_aux.c.pysam.c +++ b/samtools/bam_aux.c.pysam.c @@ -28,8 +28,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include -#include "bam.h" +#include "htslib/sam.h" static inline int bam_aux_type2size(int x) { @@ -62,16 +61,3 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s) } return 0; } - -// Only here due to libbam.a being used by some applications. -int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) -{ - hts_pos_t beg64, end64; - int r; - r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 
0 : -1; - if (beg64 > INT_MAX || end64 > INT_MAX) - return -1; - *beg = beg64; - *end = end64; - return r; -} diff --git a/samtools/bam_endian.h b/samtools/bam_endian.h deleted file mode 100644 index d870ca5..0000000 --- a/samtools/bam_endian.h +++ /dev/null @@ -1,66 +0,0 @@ -/* bam_endian.h -- endianness conversion functions. - - Copyright (C) 2008 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#ifndef BAM_ENDIAN_H -#define BAM_ENDIAN_H - -#include - -static inline int bam_is_big_endian() -{ - long one= 1; - return !(*((char *)(&one))); -} -static inline uint16_t bam_swap_endian_2(uint16_t v) -{ - return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); -} -static inline void *bam_swap_endian_2p(void *x) -{ - *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); - return x; -} -static inline uint32_t bam_swap_endian_4(uint32_t v) -{ - v = ((v & 0x0000FFFFU) << 16) | (v >> 16); - return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); -} -static inline void *bam_swap_endian_4p(void *x) -{ - *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); - return x; -} -static inline uint64_t bam_swap_endian_8(uint64_t v) -{ - v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); - v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); - return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); -} -static inline void *bam_swap_endian_8p(void *x) -{ - *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); - return x; -} - -#endif diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c index a4d757c..ccc1f17 100644 --- a/samtools/bam_fastq.c +++ b/samtools/bam_fastq.c @@ -480,6 +480,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } set_sam_opts(state->hstdout, state, opts); + autoflush_if_stdout(state->hstdout, "-"); } state->fpr[i] = state->hstdout; } @@ -546,6 +547,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* } } if (state->hstdout) { + release_autoflush(state->hstdout); if (sam_close(state->hstdout) < 0) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; @@ -622,7 +624,7 @@ int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state, int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, bam2fq_opts_t* opts) { - bam1_t *b[2] = {b1, b2}; + bam1_t *b = b1 ? 
b1 : b2; char *ifmt = opts->index_format; if (!ifmt) @@ -675,7 +677,7 @@ int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, break; case 'i': - if (write_index_rec(state->fpi[inum], b[inum], state, opts, + if (write_index_rec(state->fpi[inum], b, state, opts, bc, bc_end-bc, qt, qt_end-qt) < 0) return -1; bc = bc_end + (len==0); @@ -787,7 +789,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) while (true) { int res = sam_read1(state->fp, state->h, b[n]); if (res < -1) { - fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); + print_error("bam2fq", "Failed to read bam record"); goto err; } at_eof = res < 0; diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c index f7249d1..55013ed 100644 --- a/samtools/bam_fastq.c.pysam.c +++ b/samtools/bam_fastq.c.pysam.c @@ -151,7 +151,7 @@ typedef struct bam2fq_state { samFile *fpse; samFile *fpr[3]; samFile *fpi[3]; - samFile *hsamtools_stdout; + samFile *hstdout; sam_hdr_t *h; bool has12, use_oq, copy_tags, illumina_tag; int flag_on, flag_off, flag_alloff; @@ -406,7 +406,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->filetype = opts->filetype; state->def_qual = opts->def_qual; state->index_sequence = NULL; - state->hsamtools_stdout = NULL; + state->hstdout = NULL; state->compression_level = opts->compression_level; state->fp = sam_open(opts->fn_input, "r"); @@ -475,15 +475,16 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->fpr[i] = state->fpr[j]; } } else { - if (!state->hsamtools_stdout) { - if (!(state->hsamtools_stdout = sam_open_z("-", mode, state))) { + if (!state->hstdout) { + if (!(state->hstdout = sam_open_z("-", mode, state))) { print_error_errno("bam2fq", "Cannot open STDOUT"); free(state); return false; } - set_sam_opts(state->hsamtools_stdout, state, opts); + set_sam_opts(state->hstdout, state, opts); + autoflush_if_stdout(state->hstdout, "-"); } - state->fpr[i] 
= state->hsamtools_stdout; + state->fpr[i] = state->hstdout; } } @@ -537,7 +538,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* int i, j; for (i = 0; i < 3; ++i) { - if (state->fpr[i] != state->hsamtools_stdout) { + if (state->fpr[i] != state->hstdout) { for (j = 0; j < i; j++) if (state->fpr[i] == state->fpr[j]) break; @@ -547,8 +548,9 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* } } } - if (state->hsamtools_stdout) { - if (sam_close(state->hsamtools_stdout) < 0) { + if (state->hstdout) { + release_autoflush(state->hstdout); + if (sam_close(state->hstdout) < 0) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; } @@ -624,7 +626,7 @@ int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state, int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, bam2fq_opts_t* opts) { - bam1_t *b[2] = {b1, b2}; + bam1_t *b = b1 ? b1 : b2; char *ifmt = opts->index_format; if (!ifmt) @@ -677,7 +679,7 @@ int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, break; case 'i': - if (write_index_rec(state->fpi[inum], b[inum], state, opts, + if (write_index_rec(state->fpi[inum], b, state, opts, bc, bc_end-bc, qt, qt_end-qt) < 0) return -1; bc = bc_end + (len==0); @@ -789,7 +791,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) while (true) { int res = sam_read1(state->fp, state->h, b[n]); if (res < -1) { - fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); + print_error("bam2fq", "Failed to read bam record"); goto err; } at_eof = res < 0; diff --git a/samtools/bam_import.c b/samtools/bam_import.c index daf6b17..6a25914 100644 --- a/samtools/bam_import.c +++ b/samtools/bam_import.c @@ -55,6 +55,7 @@ static int usage(FILE *fp, int exit_status) { fprintf(fp, " Tag to use with barcode sequences [BC]\n"); fprintf(fp, " --quality-tag TAG\n"); fprintf(fp, " Tag to use with barcode qualities [QT]\n"); + fprintf(fp, " -N, 
--name2 Use 2nd field as read name (SRA format)\n"); fprintf(fp, " -r STRING Build up a complete @RG line\n"); fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"); fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"); @@ -94,6 +95,7 @@ typedef struct { char *order; int compress_level; htsThreadPool p; + int name2; } opts_t; // Append a sequence and quality string from a BAM record to a BC:Z and @@ -174,6 +176,8 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { hts_set_thread_pool(fp_in[i], &opts->p); ids[n++] = i; + if (opts->name2) + hts_set_opt(fp_in[i], FASTQ_OPT_NAME2, 1); if (opts->casava) hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1); if (opts->barcode_seq) // for auto-CASAVA parsing @@ -228,6 +232,7 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { perror(opts->fn_out); goto err; } + autoflush_if_stdout(fp_out, opts->fn_out); if (opts->p.pool) hts_set_thread_pool(fp_out, &opts->p); @@ -378,6 +383,7 @@ err: ks_free(&index_str); ks_free(&read_str); if (fp_out) { + release_autoflush(fp_out); if (sam_close(fp_out) < 0) { perror(opts->fn_out); ret |= -1; @@ -410,6 +416,7 @@ int main_import(int argc, char *argv[]) { .rg_line = NULL, .order = NULL, .compress_level = -1, + .name2 = 0, }; kstring_t rg = {0}; @@ -425,10 +432,11 @@ int main_import(int argc, char *argv[]) { {"order", required_argument, NULL, 3}, {"barcode-tag", required_argument, NULL, 4}, {"quality-tag", required_argument, NULL, 5}, + {"name2", no_argument, NULL, 'N'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:N", lopts, NULL)) >= 0) { switch (c) { case 'b': opts.idx_both = 1; break; case '0': opts.fn[FQ_R0] = optarg; break; @@ -453,6 +461,8 @@ int main_import(int argc, char *argv[]) { opts.rg_line = rg.s; break; + case 'N': opts.name2 = 1; break; + case 9: opts.no_pg = 1; break; case 3: opts.order = 
optarg; break; diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c index 1307ac6..c66f7c8 100644 --- a/samtools/bam_import.c.pysam.c +++ b/samtools/bam_import.c.pysam.c @@ -57,6 +57,7 @@ static int usage(FILE *fp, int exit_status) { fprintf(fp, " Tag to use with barcode sequences [BC]\n"); fprintf(fp, " --quality-tag TAG\n"); fprintf(fp, " Tag to use with barcode qualities [QT]\n"); + fprintf(fp, " -N, --name2 Use 2nd field as read name (SRA format)\n"); fprintf(fp, " -r STRING Build up a complete @RG line\n"); fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"); fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"); @@ -96,6 +97,7 @@ typedef struct { char *order; int compress_level; htsThreadPool p; + int name2; } opts_t; // Append a sequence and quality string from a BAM record to a BC:Z and @@ -176,6 +178,8 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { hts_set_thread_pool(fp_in[i], &opts->p); ids[n++] = i; + if (opts->name2) + hts_set_opt(fp_in[i], FASTQ_OPT_NAME2, 1); if (opts->casava) hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1); if (opts->barcode_seq) // for auto-CASAVA parsing @@ -230,6 +234,7 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { perror(opts->fn_out); goto err; } + autoflush_if_stdout(fp_out, opts->fn_out); if (opts->p.pool) hts_set_thread_pool(fp_out, &opts->p); @@ -380,6 +385,7 @@ err: ks_free(&index_str); ks_free(&read_str); if (fp_out) { + release_autoflush(fp_out); if (sam_close(fp_out) < 0) { perror(opts->fn_out); ret |= -1; @@ -412,6 +418,7 @@ int main_import(int argc, char *argv[]) { .rg_line = NULL, .order = NULL, .compress_level = -1, + .name2 = 0, }; kstring_t rg = {0}; @@ -427,10 +434,11 @@ int main_import(int argc, char *argv[]) { {"order", required_argument, NULL, 3}, {"barcode-tag", required_argument, NULL, 4}, {"quality-tag", required_argument, NULL, 5}, + {"name2", no_argument, NULL, 'N'}, { NULL, 0, NULL, 0 } }; - while ((c = 
getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:N", lopts, NULL)) >= 0) { switch (c) { case 'b': opts.idx_both = 1; break; case '0': opts.fn[FQ_R0] = optarg; break; @@ -455,6 +463,8 @@ int main_import(int argc, char *argv[]) { opts.rg_line = rg.s; break; + case 'N': opts.name2 = 1; break; + case 9: opts.no_pg = 1; break; case 3: opts.order = optarg; break; diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index 2da184f..84ec1ec 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -1,7 +1,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2020 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. Author: Andrew Whitwham @@ -928,10 +928,6 @@ static int optical_duplicate_partial(const char *name, const int oxpos, const lo c->y = dy; c->xpos = dxpos; - if (ret) { - c->opt = ret; - } - return ret; } diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index 7132687..a478956 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -3,7 +3,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2020 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. Author: Andrew Whitwham @@ -930,10 +930,6 @@ static int optical_duplicate_partial(const char *name, const int oxpos, const lo c->y = dy; c->xpos = dxpos; - if (ret) { - c->opt = ret; - } - return ret; } diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index 6fd282c..10e79c6 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -42,6 +42,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" @@ -65,11 +66,14 @@ static inline int printw(int c, FILE *fp) } static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, - hts_pos_t ref_len, const char *ref, kstring_t *ks, - int rev_del) + hts_pos_t ref_len, const char *ref, kstring_t *ks, + int rev_del, int no_ins, int no_ins_mods, + int no_del, int no_ends) { + no_ins_mods |= no_ins; int j; - if (p->is_head) { + hts_base_mod_state *m = p->cd.p; + if (!no_ends && p->is_head) { putc('^', fp); putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp); } @@ -86,32 +90,74 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, else c = bam_is_rev(p->b)? tolower(c) : toupper(c); } putc(c, fp); + if (m) { + int nm; + hts_base_mod mod[256]; + if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 256)) > 0) { + putc('[', fp); + int j; + for (j = 0; j < nm && j < 256; j++) { + char qual[20]; + if (mod[j].qual >= 0) + sprintf(qual, "%d", mod[j].qual); + else + *qual = 0; + if (mod[j].modified_base < 0) + // ChEBI + fprintf(fp, "%c(%d)%s", "+-"[mod[j].strand], + -mod[j].modified_base, qual); + else + fprintf(fp, "%c%c%s", "+-"[mod[j].strand], + mod[j].modified_base, qual); + } + putc(']', fp); + } + } } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp); int del_len = -p->indel; if (p->indel > 0) { - int len = bam_plp_insertion(p, ks, &del_len); + int len = bam_plp_insertion_mod(p, m && !no_ins_mods ? m : NULL, + ks, &del_len); if (len < 0) { print_error("mpileup", "bam_plp_insertion() failed"); return -1; } - putc('+', fp); printw(len, fp); - if (bam_is_rev(p->b)) { - char pad = rev_del ? '#' : '*'; - for (j = 0; j < len; j++) - putc(ks->s[j] != '*' ? 
tolower(ks->s[j]) : pad, fp); - } else { - for (j = 0; j < len; j++) - putc(toupper(ks->s[j]), fp); + if (no_ins < 2) { + putc('+', fp); + printw(len, fp); + } + if (!no_ins) { + if (bam_is_rev(p->b)) { + char pad = rev_del ? '#' : '*'; + int in_mod = 0; + for (j = 0; j < ks->l; j++) { + if (ks->s[j] == '[') in_mod = 1; + else if (ks->s[j] == ']') in_mod = 0; + putc(ks->s[j] != '*' + ? (in_mod ? ks->s[j] : tolower(ks->s[j])) + : pad, fp); + } + } else { + int in_mod = 0; + for (j = 0; j < ks->l; j++) { + if (ks->s[j] == '[') in_mod = 1; + if (ks->s[j] == ']') in_mod = 0; + putc(in_mod ? ks->s[j] : toupper(ks->s[j]), fp); + } + } } } if (del_len > 0) { - printw(-del_len, fp); - for (j = 1; j <= del_len; ++j) { - int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; - putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); + if (no_del < 2) + printw(-del_len, fp); + if (!no_del) { + for (j = 1; j <= del_len; ++j) { + int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; + putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); + } } } - if (p->is_tail) putc('$', fp); + if (!no_ends && p->is_tail) putc('$', fp); return 0; } @@ -144,6 +190,10 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_PRINT_TLEN (1<<21) #define MPLP_PRINT_SEQ (1<<22) #define MPLP_PRINT_QUAL (1<<23) +#define MPLP_PRINT_MODS (1<<24) +#define MPLP_PRINT_QPOS5 (1<<25) + +#define MPLP_PRINT_LAST (1<<26) // terminator for loop #define MPLP_MAX_DEPTH 8000 #define MPLP_MAX_INDEL_DEPTH 250 @@ -158,7 +208,7 @@ typedef struct { void *bed, *rghash, *auxlist; int argc; char **argv; - char sep, empty; + char sep, empty, no_ins, no_ins_mods, no_del, no_ends; sam_global_args ga; } mplp_conf_t; @@ -289,6 +339,23 @@ static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, hts_pos_t *ref_len) return 1; } +// Initialise and destroy the base modifier state data. This is called +// as each new read is added or removed from the pileups. 
+static +int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { + int ret; + hts_base_mod_state *m = hts_base_mod_state_alloc(); + ret = bam_parse_basemod(b, m); + cd->p = m; + return ret; +} + +static +int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) { + hts_base_mod_state_free(cd->p); + return 0; +} + static void print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len) @@ -298,8 +365,8 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, for (i = 0; i < n; ++i) { fputs("\t0\t*\t*", fp); int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_QUAL + 1) { - if (conf->flag & flag_value) + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS && (conf->flag & flag_value)) fputs("\t*", fp); flag_value <<= 1; } @@ -445,6 +512,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } // read the header of each file in the list and initialize data + refs_t *refs = NULL; for (i = 0; i < n; ++i) { sam_hdr_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); @@ -458,11 +526,22 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } - if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { - fprintf(stderr, "[%s] failed to process %s: %s\n", - __func__, conf->fai_fname, strerror(errno)); - exit(EXIT_FAILURE); + + if (!refs && conf->fai_fname) { + if (hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { + fprintf(stderr, "[%s] failed to process %s: %s\n", + __func__, conf->fai_fname, strerror(errno)); + exit(EXIT_FAILURE); + } + refs = cram_get_refs(data[i]->fp); + } else if (conf->fai_fname) { + if (hts_set_opt(data[i]->fp, CRAM_OPT_SHARED_REF, refs) != 0) { + fprintf(stderr, "[%s] failed to process %s: %s\n", + __func__, conf->fai_fname, strerror(errno)); + 
exit(EXIT_FAILURE); + } } + data[i]->conf = conf; data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(data[i]->fp); @@ -530,6 +609,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } + autoflush_if_stdout(bcf_fp, conf->output_fname); // BCF header creation bcf_hdr = bcf_hdr_init("w"); @@ -657,6 +737,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) // init pileup iter = bam_mplp_init(n, mplp_func, (void**)data); + if (conf->flag & MPLP_PRINT_MODS) { + bam_mplp_constructor(iter, pileup_cd_create); + bam_mplp_destructor(iter, pileup_cd_destroy); + } if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); if ( !conf->max_depth ) { max_depth = INT_MAX; @@ -759,8 +843,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_QUAL + 1) { - if (conf->flag & flag_value) + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) fputs("\t*", pileup_fp); flag_value <<= 1; } @@ -779,7 +864,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) : 0; if (c >= conf->min_baseQ) { n++; - if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) { + if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, + ref, &ks, conf->rev_del, + conf->no_ins, conf->no_ins_mods, + conf->no_del, conf->no_ends) < 0) { ret = 1; goto fail; } @@ -806,8 +894,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) /* Print selected columns */ int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_QUAL + 1) { - if (conf->flag & flag_value) { + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && 
(conf->flag & flag_value)) { n = 0; putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { @@ -826,8 +915,17 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) putc(c, pileup_fp); break; case MPLP_PRINT_QPOS: + // query position in current orientation fprintf(pileup_fp, "%d", p->qpos + 1); break; + case MPLP_PRINT_QPOS5: { + // query position in 5' to 3' orientation + int pos5 = bam_is_rev(p->b) + ? p->b->core.l_qseq-p->qpos + p->is_del + : p->qpos + 1; + fprintf(pileup_fp, "%d", pos5); + break; + } case MPLP_PRINT_QNAME: fputs(bam_get_qname(p->b), pileup_fp); break; @@ -951,6 +1049,7 @@ fail: bcf_destroy1(bcf_rec); if (bcf_fp) { + release_autoflush(bcf_fp); hts_close(bcf_fp); bcf_hdr_destroy(bcf_hdr); bcf_call_destroy(bca); @@ -1113,16 +1212,24 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -X, --customized-index use customized index files\n" // -X flag for index filename "\n" "Output options:\n" -" -o, --output FILE write output to FILE [standard output]\n" -" -O, --output-BP output base positions on reads\n" -" -s, --output-MQ output mapping quality\n" -" --output-QNAME output read names\n" -" --output-extra STR output extra read fields and read tag values\n" -" --output-sep CHAR set the separator character for tag lists [,]\n" -" --output-empty CHAR set the no value character for tag lists [*]\n" -" --reverse-del use '#' character for deletions on the reverse strand\n" -" -a output all positions (including zero depth)\n" -" -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" +" -o, --output FILE write output to FILE [standard output]\n" +" -O, --output-BP output base positions on reads, current orientation\n" +" --output-BP-5 output base positions on reads, 5' to 3' orientation\n" +" -M, --output-mods output base modifications\n" +" -s, --output-MQ output mapping quality\n" +" --output-QNAME output read names\n" +" --output-extra STR output extra read fields and read tag values\n" +" --output-sep CHAR set the separator character for tag lists [,]\n" +" --output-empty CHAR set the no value character for tag lists [*]\n" +" --no-output-ins skip insertion sequence after +NUM\n" +" Use twice for complete insertion removal\n" +" --no-output-ins-mods don't display base modifications within insertions\n" +" --no-output-del skip deletion sequence after -NUM\n" +" Use twice for complete deletion removal\n" +" --no-output-ends remove ^MQUAL and $ markup in sequence column\n" +" --reverse-del use '#' character for deletions on the reverse strand\n" +" -a output all positions (including zero depth)\n" +" -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" "\n" "Generic options:\n"); sam_global_opt_help(fp, "-.--.--."); @@ -1201,8 +1308,11 @@ int bam_mpileup(int argc, char *argv[]) {"bcf", no_argument, NULL, 'g'}, {"VCF", no_argument, NULL, 'v'}, {"vcf", no_argument, NULL, 'v'}, + {"output-mods", no_argument, NULL, 'M'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, + {"output-BP-5", no_argument, NULL, 14}, + {"output-bp-5", no_argument, NULL, 14}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, {"output-tags", required_argument, NULL, 't'}, @@ -1221,10 +1331,14 @@ int bam_mpileup(int argc, char *argv[]) {"output-extra", required_argument, NULL, 7}, {"output-sep", required_argument, NULL, 8}, {"output-empty", required_argument, NULL, 9}, + {"no-output-ins", no_argument, NULL, 10}, + {"no-output-ins-mods", no_argument, NULL, 11}, + {"no-output-del", no_argument, NULL, 12}, + {"no-output-ends", no_argument, NULL, 13}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:aM",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1247,6 +1361,10 @@ int bam_mpileup(int argc, char *argv[]) break; case 8: mplp.sep = optarg[0]; break; case 9: mplp.empty = optarg[0]; break; + case 10: mplp.no_ins++; break; + case 11: mplp.no_ins_mods = 1; break; + case 12: mplp.no_del++; break; + case 13: mplp.no_ends = 1; break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == NULL) return 1; @@ -1276,7 +1394,15 @@ int bam_mpileup(int argc, char *argv[]) case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break; - case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; + case 'O': + if (!(mplp.flag & MPLP_PRINT_QPOS5)) + mplp.flag |= MPLP_PRINT_QPOS; + break; + case 14: + 
mplp.flag |= MPLP_PRINT_QPOS5; + mplp.flag &= ~MPLP_PRINT_QPOS; + break; + case 'M': mplp.flag |= MPLP_PRINT_MODS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index bcb8a5c..7eb601d 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -44,6 +44,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" @@ -67,11 +68,14 @@ static inline int printw(int c, FILE *fp) } static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, - hts_pos_t ref_len, const char *ref, kstring_t *ks, - int rev_del) + hts_pos_t ref_len, const char *ref, kstring_t *ks, + int rev_del, int no_ins, int no_ins_mods, + int no_del, int no_ends) { + no_ins_mods |= no_ins; int j; - if (p->is_head) { + hts_base_mod_state *m = p->cd.p; + if (!no_ends && p->is_head) { putc('^', fp); putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp); } @@ -88,32 +92,74 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, else c = bam_is_rev(p->b)? tolower(c) : toupper(c); } putc(c, fp); + if (m) { + int nm; + hts_base_mod mod[256]; + if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 256)) > 0) { + putc('[', fp); + int j; + for (j = 0; j < nm && j < 256; j++) { + char qual[20]; + if (mod[j].qual >= 0) + sprintf(qual, "%d", mod[j].qual); + else + *qual = 0; + if (mod[j].modified_base < 0) + // ChEBI + fprintf(fp, "%c(%d)%s", "+-"[mod[j].strand], + -mod[j].modified_base, qual); + else + fprintf(fp, "%c%c%s", "+-"[mod[j].strand], + mod[j].modified_base, qual); + } + putc(']', fp); + } + } } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? 
'#' : '*'), fp); int del_len = -p->indel; if (p->indel > 0) { - int len = bam_plp_insertion(p, ks, &del_len); + int len = bam_plp_insertion_mod(p, m && !no_ins_mods ? m : NULL, + ks, &del_len); if (len < 0) { print_error("mpileup", "bam_plp_insertion() failed"); return -1; } - putc('+', fp); printw(len, fp); - if (bam_is_rev(p->b)) { - char pad = rev_del ? '#' : '*'; - for (j = 0; j < len; j++) - putc(ks->s[j] != '*' ? tolower(ks->s[j]) : pad, fp); - } else { - for (j = 0; j < len; j++) - putc(toupper(ks->s[j]), fp); + if (no_ins < 2) { + putc('+', fp); + printw(len, fp); + } + if (!no_ins) { + if (bam_is_rev(p->b)) { + char pad = rev_del ? '#' : '*'; + int in_mod = 0; + for (j = 0; j < ks->l; j++) { + if (ks->s[j] == '[') in_mod = 1; + else if (ks->s[j] == ']') in_mod = 0; + putc(ks->s[j] != '*' + ? (in_mod ? ks->s[j] : tolower(ks->s[j])) + : pad, fp); + } + } else { + int in_mod = 0; + for (j = 0; j < ks->l; j++) { + if (ks->s[j] == '[') in_mod = 1; + if (ks->s[j] == ']') in_mod = 0; + putc(in_mod ? ks->s[j] : toupper(ks->s[j]), fp); + } + } } } if (del_len > 0) { - printw(-del_len, fp); - for (j = 1; j <= del_len; ++j) { - int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; - putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); + if (no_del < 2) + printw(-del_len, fp); + if (!no_del) { + for (j = 1; j <= del_len; ++j) { + int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; + putc(bam_is_rev(p->b)? 
tolower(c) : toupper(c), fp); + } } } - if (p->is_tail) putc('$', fp); + if (!no_ends && p->is_tail) putc('$', fp); return 0; } @@ -146,6 +192,10 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, #define MPLP_PRINT_TLEN (1<<21) #define MPLP_PRINT_SEQ (1<<22) #define MPLP_PRINT_QUAL (1<<23) +#define MPLP_PRINT_MODS (1<<24) +#define MPLP_PRINT_QPOS5 (1<<25) + +#define MPLP_PRINT_LAST (1<<26) // terminator for loop #define MPLP_MAX_DEPTH 8000 #define MPLP_MAX_INDEL_DEPTH 250 @@ -160,7 +210,7 @@ typedef struct { void *bed, *rghash, *auxlist; int argc; char **argv; - char sep, empty; + char sep, empty, no_ins, no_ins_mods, no_del, no_ends; sam_global_args ga; } mplp_conf_t; @@ -291,6 +341,23 @@ static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, hts_pos_t *ref_len) return 1; } +// Initialise and destroy the base modifier state data. This is called +// as each new read is added or removed from the pileups. +static +int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { + int ret; + hts_base_mod_state *m = hts_base_mod_state_alloc(); + ret = bam_parse_basemod(b, m); + cd->p = m; + return ret; +} + +static +int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) { + hts_base_mod_state_free(cd->p); + return 0; +} + static void print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len) @@ -300,8 +367,8 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, for (i = 0; i < n; ++i) { fputs("\t0\t*\t*", fp); int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_QUAL + 1) { - if (conf->flag & flag_value) + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS && (conf->flag & flag_value)) fputs("\t*", fp); flag_value <<= 1; } @@ -447,6 +514,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) } // read the header of each file in the list and initialize data + 
refs_t *refs = NULL; for (i = 0; i < n; ++i) { sam_hdr_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); @@ -460,11 +528,22 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); samtools_exit(EXIT_FAILURE); } - if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { - fprintf(samtools_stderr, "[%s] failed to process %s: %s\n", - __func__, conf->fai_fname, strerror(errno)); - samtools_exit(EXIT_FAILURE); + + if (!refs && conf->fai_fname) { + if (hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { + fprintf(samtools_stderr, "[%s] failed to process %s: %s\n", + __func__, conf->fai_fname, strerror(errno)); + samtools_exit(EXIT_FAILURE); + } + refs = cram_get_refs(data[i]->fp); + } else if (conf->fai_fname) { + if (hts_set_opt(data[i]->fp, CRAM_OPT_SHARED_REF, refs) != 0) { + fprintf(samtools_stderr, "[%s] failed to process %s: %s\n", + __func__, conf->fai_fname, strerror(errno)); + samtools_exit(EXIT_FAILURE); + } } + data[i]->conf = conf; data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(data[i]->fp); @@ -532,6 +611,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); samtools_exit(EXIT_FAILURE); } + autoflush_if_stdout(bcf_fp, conf->output_fname); // BCF header creation bcf_hdr = bcf_hdr_init("w"); @@ -659,6 +739,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) // init pileup iter = bam_mplp_init(n, mplp_func, (void**)data); + if (conf->flag & MPLP_PRINT_MODS) { + bam_mplp_constructor(iter, pileup_cd_create); + bam_mplp_destructor(iter, pileup_cd_destroy); + } if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); if ( !conf->max_depth ) { max_depth = INT_MAX; @@ -761,8 +845,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_QUAL + 1) { - if (conf->flag & flag_value) + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) fputs("\t*", pileup_fp); flag_value <<= 1; } @@ -781,7 +866,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) : 0; if (c >= conf->min_baseQ) { n++; - if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) { + if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, + ref, &ks, conf->rev_del, + conf->no_ins, conf->no_ins_mods, + conf->no_del, conf->no_ends) < 0) { ret = 1; goto fail; } @@ -808,8 +896,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) /* Print selected columns */ int flag_value = MPLP_PRINT_MAPQ_CHAR; - while(flag_value < MPLP_PRINT_QUAL + 1) { - if (conf->flag & flag_value) { + while(flag_value < MPLP_PRINT_LAST) { + if (flag_value != MPLP_PRINT_MODS + && (conf->flag & flag_value)) { n = 0; putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { @@ -828,8 +917,17 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) putc(c, pileup_fp); break; case MPLP_PRINT_QPOS: + // query position in current orientation 
fprintf(pileup_fp, "%d", p->qpos + 1); break; + case MPLP_PRINT_QPOS5: { + // query position in 5' to 3' orientation + int pos5 = bam_is_rev(p->b) + ? p->b->core.l_qseq-p->qpos + p->is_del + : p->qpos + 1; + fprintf(pileup_fp, "%d", pos5); + break; + } case MPLP_PRINT_QNAME: fputs(bam_get_qname(p->b), pileup_fp); break; @@ -953,6 +1051,7 @@ fail: bcf_destroy1(bcf_rec); if (bcf_fp) { + release_autoflush(bcf_fp); hts_close(bcf_fp); bcf_hdr_destroy(bcf_hdr); bcf_call_destroy(bca); @@ -1115,16 +1214,24 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -X, --customized-index use customized index files\n" // -X flag for index filename "\n" "Output options:\n" -" -o, --output FILE write output to FILE [standard output]\n" -" -O, --output-BP output base positions on reads\n" -" -s, --output-MQ output mapping quality\n" -" --output-QNAME output read names\n" -" --output-extra STR output extra read fields and read tag values\n" -" --output-sep CHAR set the separator character for tag lists [,]\n" -" --output-empty CHAR set the no value character for tag lists [*]\n" -" --reverse-del use '#' character for deletions on the reverse strand\n" -" -a output all positions (including zero depth)\n" -" -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" +" -o, --output FILE write output to FILE [standard output]\n" +" -O, --output-BP output base positions on reads, current orientation\n" +" --output-BP-5 output base positions on reads, 5' to 3' orientation\n" +" -M, --output-mods output base modifications\n" +" -s, --output-MQ output mapping quality\n" +" --output-QNAME output read names\n" +" --output-extra STR output extra read fields and read tag values\n" +" --output-sep CHAR set the separator character for tag lists [,]\n" +" --output-empty CHAR set the no value character for tag lists [*]\n" +" --no-output-ins skip insertion sequence after +NUM\n" +" Use twice for complete insertion removal\n" +" --no-output-ins-mods don't display base modifications within insertions\n" +" --no-output-del skip deletion sequence after -NUM\n" +" Use twice for complete deletion removal\n" +" --no-output-ends remove ^MQUAL and $ markup in sequence column\n" +" --reverse-del use '#' character for deletions on the reverse strand\n" +" -a output all positions (including zero depth)\n" +" -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" "\n" "Generic options:\n"); sam_global_opt_help(fp, "-.--.--."); @@ -1203,8 +1310,11 @@ int bam_mpileup(int argc, char *argv[]) {"bcf", no_argument, NULL, 'g'}, {"VCF", no_argument, NULL, 'v'}, {"vcf", no_argument, NULL, 'v'}, + {"output-mods", no_argument, NULL, 'M'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, + {"output-BP-5", no_argument, NULL, 14}, + {"output-bp-5", no_argument, NULL, 14}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, {"output-tags", required_argument, NULL, 't'}, @@ -1223,10 +1333,14 @@ int bam_mpileup(int argc, char *argv[]) {"output-extra", required_argument, NULL, 7}, {"output-sep", required_argument, NULL, 8}, {"output-empty", required_argument, NULL, 9}, + {"no-output-ins", no_argument, NULL, 10}, + {"no-output-ins-mods", no_argument, NULL, 11}, + {"no-output-del", no_argument, NULL, 12}, + {"no-output-ends", no_argument, NULL, 13}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:aM",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : @@ -1249,6 +1363,10 @@ int bam_mpileup(int argc, char *argv[]) break; case 8: mplp.sep = optarg[0]; break; case 9: mplp.empty = optarg[0]; break; + case 10: mplp.no_ins++; break; + case 11: mplp.no_ins_mods = 1; break; + case 12: mplp.no_del++; break; + case 13: mplp.no_ends = 1; break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == NULL) return 1; @@ -1278,7 +1396,15 @@ int bam_mpileup(int argc, char *argv[]) case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break; - case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; + case 'O': + if (!(mplp.flag & MPLP_PRINT_QPOS5)) + mplp.flag |= MPLP_PRINT_QPOS; + break; + case 14: + 
mplp.flag |= MPLP_PRINT_QPOS5; + mplp.flag &= ~MPLP_PRINT_QPOS; + break; + case 'M': mplp.flag |= MPLP_PRINT_MODS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; diff --git a/samtools/bam_reheader.c b/samtools/bam_reheader.c index 7b2a46c..0ad308a 100644 --- a/samtools/bam_reheader.c +++ b/samtools/bam_reheader.c @@ -380,7 +380,7 @@ int cram_reheader_inplace3(cram_fd *fd, sam_hdr_t *h, const char *arg_list, cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); cram_block_update_size(b); - cram_compress_block(fd, b, NULL, -1, -1); + cram_compress_block(fd, b, NULL, -1, 9); if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26) goto err; diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index a48d7f6..e36e6e7 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -382,7 +382,7 @@ int cram_reheader_inplace3(cram_fd *fd, sam_hdr_t *h, const char *arg_list, cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); cram_block_update_size(b); - cram_compress_block(fd, b, NULL, -1, -1); + cram_compress_block(fd, b, NULL, -1, 9); if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26) goto err; diff --git a/samtools/bam_samples.c b/samtools/bam_samples.c new file mode 100644 index 0000000..f72ca6e --- /dev/null +++ b/samtools/bam_samples.c @@ -0,0 +1,433 @@ +/* bam_samples -- print samples in a set of BAM files + + Copyright (C) 2021 Pierre Lindenbaum + Institut du Thorax. u1087 Nantes. France. 
+ @yokofakun + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +KHASH_MAP_INIT_STR(sm, int) + +/** and chained struct containing the faidx and the fasta filename + will be compared with the @SQ lines in the SAM header + */ +typedef struct FaidxPath { + /** path to reference */ + char* filename; + /** fasta index */ + faidx_t* faidx; + struct FaidxPath* next; +} FaidxPath; + +/** program parameters */ +typedef struct Params { + /** output stream */ + FILE* out; + /** tag in @RG line. 
default is "SM" */ + char tag[3]; + /** first faidx/path in chained list */ + FaidxPath* faidx; + /** show whether the bam is indexed */ + int test_index; +} Params; + +/** print usage */ +static void usage_samples(FILE *write_to) { + fprintf(write_to, + "Usage: samtools samples [options] [...]\n" + " samtools samples [options] -X f1.bam f2.bam f1.bam.bai f2.bai \n" + " find dir1 dir2 -type f \\(-name \"*.bam\" -o -name \"*.cram\" \\) | samtools samples [options]\n" + " find dir1 dir2 -type f \\(-name \"*.bam\" -o -name \"*.bai\" \\) | sort | paste - - | samtools samples -X [options]\n" + "\n" + "Options:\n" + " -? print help and exit\n" + " -h add the columns header before printing the results\n" + " -i test if the file is indexed.\n" + " -T provide the sample tag name from the @RG line [SM].\n" + " -o output file [stdout].\n" + " -f load an indexed fasta file in the collection of references. Can be used multiple times.\n" + " -F read a file containing the paths to indexed fasta files. 
One path per line.\n" + " -X use a custom index file.\n" + "\n" + " Using -f or -F will add a column containing the path to the reference or \".\" if the reference was not found.\n" + "\n" + ); +} + + +/** loads fasta fai file into FaidxPath, add it to params->faidx */ +static int load_dictionary(struct Params* params, const char* filename) { + FaidxPath* head = params->faidx; + FaidxPath* ptr = (FaidxPath*)malloc(sizeof(FaidxPath)); + if (ptr == NULL) { + print_error_errno("samples", "Out of memory"); + return EXIT_FAILURE; + } + ptr->filename = strdup(filename); + if (ptr->filename == NULL) { + free(ptr); + print_error_errno("samples", "Out of memory"); + return EXIT_FAILURE; + } + ptr->faidx = fai_load(filename); + if (ptr->faidx == NULL) { + free(ptr->filename); + free(ptr); + print_error_errno("samples", "Cannot load index from \"%s\"", filename); + return EXIT_FAILURE; + } + /* insert at the beginning of the linked list */ + params->faidx = ptr; + ptr->next = head; + return EXIT_SUCCESS; +} + +/** load a faidx file and append it to params */ +static int load_dictionaries(Params* params, const char* filename) { + int ret; + htsFile* in; + int status = EXIT_SUCCESS; + + in = hts_open(filename, "r"); + if (in == NULL) { + print_error_errno("samples", "Cannot open \"%s\"", filename); + status = EXIT_FAILURE; + } else { + kstring_t ks = KS_INITIALIZE; + while ((ret = hts_getline(in, KS_SEP_LINE, &ks)) >= 0) { + if (load_dictionary(params, ks_str(&ks)) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + ks_free(&ks); + hts_close(in); + } + return status; +} + +/** print the sample information, search for a reference */ +static int print_sample( + Params* params, + sam_hdr_t *header, + int has_index, + const char* sample, + const char* fname) { + fputs(sample, params->out); + fputc('\t', params->out); + fputs(fname, params->out); + if (params->test_index) { + fprintf(params->out, "\t%c", has_index ? 
'Y' : 'N'); + } + if (params->faidx != NULL) { + FaidxPath* ref = NULL; + FaidxPath* curr = params->faidx; + while (curr != NULL) { + /** check names and length are the same in the same order */ + if (faidx_nseq(curr->faidx) == header->n_targets) { + int i; + for (i = 0; i < faidx_nseq(curr->faidx); i++) { + /** check name is the same */ + if (strcmp(faidx_iseq(curr->faidx, i), header->target_name[i]) != 0) break; + /** check length is the same */ + if (faidx_seq_len(curr->faidx, faidx_iseq(curr->faidx, i)) != header->target_len[i]) break; + } + /* the ref was found */ + if (i == faidx_nseq(curr->faidx)) { + ref = curr; + break; + } + } + curr = curr->next; + } + fputc('\t', params->out); + if (ref == NULL) { + fputc('.', params->out); + } else { + fputs(curr->filename, params->out); + } + } + fputc('\n', params->out); + return 0; +} + +/** open a sam file. Search for all samples in the @RG lines */ +static int print_samples(Params* params, const char* fname, const char* baifname) { + samFile *in = 0; + sam_hdr_t *header = NULL; + int n_rg; + int status = EXIT_SUCCESS; + khash_t(sm) *sample_set = NULL; + khint_t k; + int count_samples = 0; + int has_index = 0; + + if ((sample_set = kh_init(sm)) == NULL) { + print_error("samples", "Failed to initialise sample hash"); + status = EXIT_FAILURE; + goto end_print; + } + + if ((in = sam_open_format(fname, "r", NULL)) == 0) { + print_error_errno("samples", "Failed to open \"%s\" for reading", fname); + status = EXIT_FAILURE; + goto end_print; + } + if ((header = sam_hdr_read(in)) == 0) { + print_error("samples", "Failed to read the header from \"%s\"", fname); + status = EXIT_FAILURE; + goto end_print; + } + + /* try to load index if required */ + if (params->test_index) { + hts_idx_t *bam_idx; + /* path to bam index was specified */ + if (baifname != NULL) { + bam_idx = sam_index_load3(in, fname, baifname, HTS_IDX_SILENT_FAIL); + } + /* get default index */ + else { + bam_idx = sam_index_load3(in, fname, NULL, 
HTS_IDX_SILENT_FAIL); + } + has_index = bam_idx != NULL; + if (bam_idx != NULL) hts_idx_destroy(bam_idx); + /* and we continue... we have tested the index file but we always test for the samples and the references */ + } + + /* get the RG lines */ + n_rg = sam_hdr_count_lines(header, "RG"); + if (n_rg > 0) { + int i, r, ret; + char* sample; + kstring_t sm_val = KS_INITIALIZE; + for (i = 0; i < n_rg; i++) { + r = sam_hdr_find_tag_pos(header, "RG", i, params->tag, &sm_val); + if (r < 0) continue; + k = kh_get(sm, sample_set, ks_str(&sm_val)); + if (k != kh_end(sample_set)) continue; + sample = strdup(ks_str(&sm_val)); + if (sample == NULL) { + print_error_errno("samples", "Out of memory"); + status = EXIT_FAILURE; + goto end_print; + } + kh_put(sm, sample_set, sample, &ret); + if (ret < 0) { + print_error("samples", "Failed to insert key '%s' into sample_set", sample); + free(sample); + status = EXIT_FAILURE; + goto end_print; + } + ++count_samples; + } + ks_free(&sm_val); + } + if (count_samples == 0) { + print_sample(params, header, has_index, ".", fname); + } else { + for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) { + if (kh_exist(sample_set, k)) { + char* sample = (char*)kh_key(sample_set, k); + print_sample(params, header, has_index, sample, fname); + } + } + } + +end_print: + if (sample_set != NULL) { + for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) { + if (kh_exist(sample_set, k)) { + char* sample = (char*)kh_key(sample_set, k); + free(sample); + } + } + kh_destroy(sm, sample_set); + } + if (header != NULL) sam_hdr_destroy(header); + if (in != NULL) sam_close(in); + + return status; +} + + +int main_samples(int argc, char** argv) { + int status = EXIT_SUCCESS; + int print_header = 0; + int has_index_file = 0; + Params params; + char* out_filename = NULL; + FaidxPath* fai; + + strcpy(params.tag, "SM"); + params.faidx = NULL; + params.test_index =0; + + int opt; + while ((opt = getopt_long(argc, argv, "?hiXo:f:F:T:", NULL, NULL)) != 
-1) { + switch (opt) { + case 'h': + print_header = 1; + break; + case 'o': + out_filename = optarg; + break; + case 'i': + params.test_index = 1; + break; + case 'f': + if (load_dictionary(¶ms, optarg) != EXIT_SUCCESS) { + return EXIT_FAILURE; + } + break; + case 'F': + if (load_dictionaries(¶ms, optarg) != EXIT_SUCCESS) { + return EXIT_FAILURE; + } + break; + case 'T': + if (strlen(optarg) != 2) { + print_error("samples", "Length of tag \"%s\" is not 2.", optarg); + return EXIT_FAILURE; + } + strcpy(params.tag, optarg); + break; + case '?': + usage_samples(stdout); + return EXIT_SUCCESS; + case 'X': + has_index_file = 1; + break; + default: + usage_samples(stderr); + return EXIT_FAILURE; + } + } + + /* if no file was provided and input is the terminal, print the usage and exit */ + if (argc == optind && isatty(STDIN_FILENO)) { + usage_samples(stderr); + return EXIT_FAILURE; + } + + if (out_filename != NULL) { + params.out = fopen(out_filename, "w"); + if (params.out == NULL) { + print_error_errno("samples", "Cannot open \"%s\" for writing", out_filename); + return EXIT_FAILURE; + } + } else { + params.out = stdout; + } + + if (print_header) { + fprintf(params.out, "#%s\tPATH", params.tag); + if (params.test_index) fprintf(params.out, "\tINDEX"); + if (params.faidx != NULL) fprintf(params.out, "\tREFERENCE"); + fprintf(params.out, "\n"); + } + + /* no file was provided, input is stdin, each line contains the path to a bam file */ + if (argc == optind) { + htsFile* fp = hts_open("-", "r"); + if (fp == NULL) { + print_error_errno("samples", "Cannot read from stdin"); + status = EXIT_FAILURE; + } else { + kstring_t ks = KS_INITIALIZE; + int ret; + while ((ret = hts_getline(fp, KS_SEP_LINE, &ks)) >= 0) { + char* bai_path = NULL; + if (has_index_file) { + /* bam path and bam index file are separated by a tab */ + char* tab = strchr(ks_str(&ks), '\t'); + if (tab == NULL || *(tab+1) == '\0') { + print_error_errno("samples", "Expected path-to-bam(tab)path-to-index but got 
\"%s\"", ks_str(&ks)); + status = EXIT_FAILURE; + break; + } + *tab=0; + bai_path = (tab + 1); + } + if (print_samples(¶ms, ks_str(&ks), bai_path) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + ks_free(&ks); + hts_close(fp); + } + } + /* loop over each file in argc/argv bam index provided */ + else if (has_index_file) { + /* calculate number of input BAM files */ + if ((argc - optind) % 2 != 0) { + print_error("samples","Odd number of filenames detected! Each BAM file should have an index file"); + status = EXIT_FAILURE; + } else { + int i; + int n = (argc - optind ) / 2; + for (i = 0; i < n; i++) { + if (print_samples(¶ms, argv[optind+i], argv[optind+i+n]) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + } + } else { + int i; + for (i = optind; i < argc; i++) { + if (print_samples(¶ms, argv[i], NULL) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + } + + fai = params.faidx; + while (fai != NULL) { + FaidxPath* next = fai -> next; + free(fai->filename); + fai_destroy(fai->faidx); + free(fai); + fai = next; + } + + if (fflush(params.out) != 0) { + print_error_errno("samples", "Cannot flush output"); + status = EXIT_FAILURE; + } + if (out_filename != NULL) { + fclose(params.out); + } + + return status; +} diff --git a/samtools/bam_samples.c.pysam.c b/samtools/bam_samples.c.pysam.c new file mode 100644 index 0000000..891f875 --- /dev/null +++ b/samtools/bam_samples.c.pysam.c @@ -0,0 +1,435 @@ +#include "samtools.pysam.h" + +/* bam_samples -- print samples in a set of BAM files + + Copyright (C) 2021 Pierre Lindenbaum + Institut du Thorax. u1087 Nantes. France. 
+ @yokofakun + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +KHASH_MAP_INIT_STR(sm, int) + +/** and chained struct containing the faidx and the fasta filename + will be compared with the @SQ lines in the SAM header + */ +typedef struct FaidxPath { + /** path to reference */ + char* filename; + /** fasta index */ + faidx_t* faidx; + struct FaidxPath* next; +} FaidxPath; + +/** program parameters */ +typedef struct Params { + /** output stream */ + FILE* out; + /** tag in @RG line. 
default is "SM" */ + char tag[3]; + /** first faidx/path in chained list */ + FaidxPath* faidx; + /** show whether the bam is indexed */ + int test_index; +} Params; + +/** print usage */ +static void usage_samples(FILE *write_to) { + fprintf(write_to, + "Usage: samtools samples [options] [...]\n" + " samtools samples [options] -X f1.bam f2.bam f1.bam.bai f2.bai \n" + " find dir1 dir2 -type f \\(-name \"*.bam\" -o -name \"*.cram\" \\) | samtools samples [options]\n" + " find dir1 dir2 -type f \\(-name \"*.bam\" -o -name \"*.bai\" \\) | sort | paste - - | samtools samples -X [options]\n" + "\n" + "Options:\n" + " -? print help and exit\n" + " -h add the columns header before printing the results\n" + " -i test if the file is indexed.\n" + " -T provide the sample tag name from the @RG line [SM].\n" + " -o output file [samtools_stdout].\n" + " -f load an indexed fasta file in the collection of references. Can be used multiple times.\n" + " -F read a file containing the paths to indexed fasta files. 
One path per line.\n" + " -X use a custom index file.\n" + "\n" + " Using -f or -F will add a column containing the path to the reference or \".\" if the reference was not found.\n" + "\n" + ); +} + + +/** loads fasta fai file into FaidxPath, add it to params->faidx */ +static int load_dictionary(struct Params* params, const char* filename) { + FaidxPath* head = params->faidx; + FaidxPath* ptr = (FaidxPath*)malloc(sizeof(FaidxPath)); + if (ptr == NULL) { + print_error_errno("samples", "Out of memory"); + return EXIT_FAILURE; + } + ptr->filename = strdup(filename); + if (ptr->filename == NULL) { + free(ptr); + print_error_errno("samples", "Out of memory"); + return EXIT_FAILURE; + } + ptr->faidx = fai_load(filename); + if (ptr->faidx == NULL) { + free(ptr->filename); + free(ptr); + print_error_errno("samples", "Cannot load index from \"%s\"", filename); + return EXIT_FAILURE; + } + /* insert at the beginning of the linked list */ + params->faidx = ptr; + ptr->next = head; + return EXIT_SUCCESS; +} + +/** load a faidx file and append it to params */ +static int load_dictionaries(Params* params, const char* filename) { + int ret; + htsFile* in; + int status = EXIT_SUCCESS; + + in = hts_open(filename, "r"); + if (in == NULL) { + print_error_errno("samples", "Cannot open \"%s\"", filename); + status = EXIT_FAILURE; + } else { + kstring_t ks = KS_INITIALIZE; + while ((ret = hts_getline(in, KS_SEP_LINE, &ks)) >= 0) { + if (load_dictionary(params, ks_str(&ks)) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + ks_free(&ks); + hts_close(in); + } + return status; +} + +/** print the sample information, search for a reference */ +static int print_sample( + Params* params, + sam_hdr_t *header, + int has_index, + const char* sample, + const char* fname) { + fputs(sample, params->out); + fputc('\t', params->out); + fputs(fname, params->out); + if (params->test_index) { + fprintf(params->out, "\t%c", has_index ? 
'Y' : 'N'); + } + if (params->faidx != NULL) { + FaidxPath* ref = NULL; + FaidxPath* curr = params->faidx; + while (curr != NULL) { + /** check names and length are the same in the same order */ + if (faidx_nseq(curr->faidx) == header->n_targets) { + int i; + for (i = 0; i < faidx_nseq(curr->faidx); i++) { + /** check name is the same */ + if (strcmp(faidx_iseq(curr->faidx, i), header->target_name[i]) != 0) break; + /** check length is the same */ + if (faidx_seq_len(curr->faidx, faidx_iseq(curr->faidx, i)) != header->target_len[i]) break; + } + /* the ref was found */ + if (i == faidx_nseq(curr->faidx)) { + ref = curr; + break; + } + } + curr = curr->next; + } + fputc('\t', params->out); + if (ref == NULL) { + fputc('.', params->out); + } else { + fputs(curr->filename, params->out); + } + } + fputc('\n', params->out); + return 0; +} + +/** open a sam file. Search for all samples in the @RG lines */ +static int print_samples(Params* params, const char* fname, const char* baifname) { + samFile *in = 0; + sam_hdr_t *header = NULL; + int n_rg; + int status = EXIT_SUCCESS; + khash_t(sm) *sample_set = NULL; + khint_t k; + int count_samples = 0; + int has_index = 0; + + if ((sample_set = kh_init(sm)) == NULL) { + print_error("samples", "Failed to initialise sample hash"); + status = EXIT_FAILURE; + goto end_print; + } + + if ((in = sam_open_format(fname, "r", NULL)) == 0) { + print_error_errno("samples", "Failed to open \"%s\" for reading", fname); + status = EXIT_FAILURE; + goto end_print; + } + if ((header = sam_hdr_read(in)) == 0) { + print_error("samples", "Failed to read the header from \"%s\"", fname); + status = EXIT_FAILURE; + goto end_print; + } + + /* try to load index if required */ + if (params->test_index) { + hts_idx_t *bam_idx; + /* path to bam index was specified */ + if (baifname != NULL) { + bam_idx = sam_index_load3(in, fname, baifname, HTS_IDX_SILENT_FAIL); + } + /* get default index */ + else { + bam_idx = sam_index_load3(in, fname, NULL, 
HTS_IDX_SILENT_FAIL); + } + has_index = bam_idx != NULL; + if (bam_idx != NULL) hts_idx_destroy(bam_idx); + /* and we continue... we have tested the index file but we always test for the samples and the references */ + } + + /* get the RG lines */ + n_rg = sam_hdr_count_lines(header, "RG"); + if (n_rg > 0) { + int i, r, ret; + char* sample; + kstring_t sm_val = KS_INITIALIZE; + for (i = 0; i < n_rg; i++) { + r = sam_hdr_find_tag_pos(header, "RG", i, params->tag, &sm_val); + if (r < 0) continue; + k = kh_get(sm, sample_set, ks_str(&sm_val)); + if (k != kh_end(sample_set)) continue; + sample = strdup(ks_str(&sm_val)); + if (sample == NULL) { + print_error_errno("samples", "Out of memory"); + status = EXIT_FAILURE; + goto end_print; + } + kh_put(sm, sample_set, sample, &ret); + if (ret < 0) { + print_error("samples", "Failed to insert key '%s' into sample_set", sample); + free(sample); + status = EXIT_FAILURE; + goto end_print; + } + ++count_samples; + } + ks_free(&sm_val); + } + if (count_samples == 0) { + print_sample(params, header, has_index, ".", fname); + } else { + for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) { + if (kh_exist(sample_set, k)) { + char* sample = (char*)kh_key(sample_set, k); + print_sample(params, header, has_index, sample, fname); + } + } + } + +end_print: + if (sample_set != NULL) { + for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) { + if (kh_exist(sample_set, k)) { + char* sample = (char*)kh_key(sample_set, k); + free(sample); + } + } + kh_destroy(sm, sample_set); + } + if (header != NULL) sam_hdr_destroy(header); + if (in != NULL) sam_close(in); + + return status; +} + + +int main_samples(int argc, char** argv) { + int status = EXIT_SUCCESS; + int print_header = 0; + int has_index_file = 0; + Params params; + char* out_filename = NULL; + FaidxPath* fai; + + strcpy(params.tag, "SM"); + params.faidx = NULL; + params.test_index =0; + + int opt; + while ((opt = getopt_long(argc, argv, "?hiXo:f:F:T:", NULL, NULL)) != 
-1) { + switch (opt) { + case 'h': + print_header = 1; + break; + case 'o': + out_filename = optarg; + break; + case 'i': + params.test_index = 1; + break; + case 'f': + if (load_dictionary(&params, optarg) != EXIT_SUCCESS) { + return EXIT_FAILURE; + } + break; + case 'F': + if (load_dictionaries(&params, optarg) != EXIT_SUCCESS) { + return EXIT_FAILURE; + } + break; + case 'T': + if (strlen(optarg) != 2) { + print_error("samples", "Length of tag \"%s\" is not 2.", optarg); + return EXIT_FAILURE; + } + strcpy(params.tag, optarg); + break; + case '?': + usage_samples(samtools_stdout); + return EXIT_SUCCESS; + case 'X': + has_index_file = 1; + break; + default: + usage_samples(samtools_stderr); + return EXIT_FAILURE; + } + } + + /* if no file was provided and input is the terminal, print the usage and exit */ + if (argc == optind && isatty(STDIN_FILENO)) { + usage_samples(samtools_stderr); + return EXIT_FAILURE; + } + + if (out_filename != NULL) { + params.out = fopen(out_filename, "w"); + if (params.out == NULL) { + print_error_errno("samples", "Cannot open \"%s\" for writing", out_filename); + return EXIT_FAILURE; + } + } else { + params.out = samtools_stdout; + } + + if (print_header) { + fprintf(params.out, "#%s\tPATH", params.tag); + if (params.test_index) fprintf(params.out, "\tINDEX"); + if (params.faidx != NULL) fprintf(params.out, "\tREFERENCE"); + fprintf(params.out, "\n"); + } + + /* no file was provided, input is stdin, each line contains the path to a bam file */ + if (argc == optind) { + htsFile* fp = hts_open("-", "r"); + if (fp == NULL) { + print_error_errno("samples", "Cannot read from stdin"); + status = EXIT_FAILURE; + } else { + kstring_t ks = KS_INITIALIZE; + int ret; + while ((ret = hts_getline(fp, KS_SEP_LINE, &ks)) >= 0) { + char* bai_path = NULL; + if (has_index_file) { + /* bam path and bam index file are separated by a tab */ + char* tab = strchr(ks_str(&ks), '\t'); + if (tab == NULL || *(tab+1) == '\0') { + print_error_errno("samples", "Expected 
path-to-bam(tab)path-to-index but got \"%s\"", ks_str(&ks)); + status = EXIT_FAILURE; + break; + } + *tab=0; + bai_path = (tab + 1); + } + if (print_samples(&params, ks_str(&ks), bai_path) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + ks_free(&ks); + hts_close(fp); + } + } + /* loop over each file in argc/argv bam index provided */ + else if (has_index_file) { + /* calculate number of input BAM files */ + if ((argc - optind) % 2 != 0) { + print_error("samples","Odd number of filenames detected! Each BAM file should have an index file"); + status = EXIT_FAILURE; + } else { + int i; + int n = (argc - optind ) / 2; + for (i = 0; i < n; i++) { + if (print_samples(&params, argv[optind+i], argv[optind+i+n]) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + } + } else { + int i; + for (i = optind; i < argc; i++) { + if (print_samples(&params, argv[i], NULL) != EXIT_SUCCESS) { + status = EXIT_FAILURE; + break; + } + } + } + + fai = params.faidx; + while (fai != NULL) { + FaidxPath* next = fai -> next; + free(fai->filename); + fai_destroy(fai->faidx); + free(fai); + fai = next; + } + + if (fflush(params.out) != 0) { + print_error_errno("samples", "Cannot flush output"); + status = EXIT_FAILURE; + } + if (out_filename != NULL) { + fclose(params.out); + } + + return status; +} diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 46a1d80..0971c3f 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -47,6 +47,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "htslib/hts_endian.h" +#include "htslib/cram.h" #include "sam_opts.h" #include "samtools.h" #include "bedidx.h" @@ -1017,6 +1018,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m hts_reglist_t *lreg = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; + refs_t *refs = NULL; // Is there a specified pre-prepared header to use for output? 
if (headers) { @@ -1099,23 +1101,21 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG, (flag & MERGE_FIRST_CO)? (i == 0) : true, RG[i])) - return -1; // FIXME: memory leak + goto fail; hdr[i] = hin; + int order_ok = 1; if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); + order_ok = 0; } - // Potential future improvement is to share headers between CRAM files for - // samtools sort (where all headers are identical. - // Eg: - // - // if (i > 1) { - // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram)); - // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram)); - // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram)); - // } + if (!refs) + refs = cram_get_refs(fp[i]); + + if (order_ok && refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs)) + goto fail; } // Did we get an @HD line? 
@@ -1134,7 +1134,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m goto fail; hout = merged_hdr->hdr; - if (!hout) return -1; // FIXME: memory leak + if (!hout) + goto fail; // If we're only merging a specified region move our iters to start at that point int tid, nreg; @@ -1284,6 +1285,9 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m } if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); + if (refs && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs)) + goto fail; + // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { @@ -1934,9 +1938,11 @@ typedef struct { const char *prefix; bam1_tag *buf; const sam_hdr_t *h; + char *tmpfile_name; int index; int error; int no_save; + int large_pos; } worker_t; // Returns 0 for success @@ -2244,7 +2250,9 @@ static void *worker(void *data) { worker_t *w = (worker_t*)data; char *name; + size_t name_len; w->error = 0; + w->tmpfile_name = NULL; if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) { if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { @@ -2280,40 +2288,42 @@ static void *worker(void *data) if (w->no_save) return 0; - name = (char*)calloc(strlen(w->prefix) + 20, 1); + name_len = strlen(w->prefix) + 30; + name = (char*)calloc(name_len, 1); if (!name) { w->error = errno; return 0; } - sprintf(name, "%s.%.4d.bam", w->prefix, w->index); - - uint32_t max_ncigar = 0; - int i; - for (i = 0; i < w->buf_len; i++) { - uint32_t nc = w->buf[i].bam_record->core.n_cigar; - if (max_ncigar < nc) - max_ncigar = nc; - } + const int MAX_TRIES = 1000; + int tries = 0; + for (;;) { + if (tries) { + snprintf(name, name_len, "%s.%.4d-%.3d.bam", + w->prefix, w->index, tries); + } else { + snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index); + } - if (max_ncigar > 65535) { - htsFormat fmt; - memset(&fmt, 0, sizeof(fmt)); - if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) { + if 
(write_buffer(name, w->large_pos ? "wzx1" : "wbx1", + w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) { + break; + } + if (errno == EEXIST && tries < MAX_TRIES) { + tries++; + } else { w->error = errno; - free(name); - return 0; + break; } + } - if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0) - w->error = errno; + if (w->error) { + free(name); } else { - if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0) - w->error = errno; + w->tmpfile_name = name; } - - free(name); return 0; } static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, - const sam_hdr_t *h, int n_threads, buf_region *in_mem) + const sam_hdr_t *h, int n_threads, buf_region *in_mem, + int large_pos, char **fns, size_t fns_size) { int i; size_t pos, rest; @@ -2337,6 +2347,8 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, w[i].prefix = prefix; w[i].h = h; w[i].index = n_files + i; + w[i].tmpfile_name = NULL; + w[i].large_pos = large_pos; if (in_mem) { w[i].no_save = 1; in_mem[i].from = pos; @@ -2349,12 +2361,27 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, } for (i = 0; i < n_threads; ++i) { pthread_join(tid[i], 0); + if (!in_mem) { + assert(w[i].index >= 0 && w[i].index < fns_size); + fns[w[i].index] = w[i].tmpfile_name; + } if (w[i].error != 0) { errno = w[i].error; print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index); n_failed++; } } + if (n_failed && !in_mem) { + // Clean up any temporary files that did get made, as we're + // about to lose track of them + for (i = 0; i < n_threads; ++i) { + if (fns[w[i].index]) { + unlink(fns[w[i].index]); + free(fns[w[i].index]); + fns[w[i].index] = NULL; + } + } + } free(tid); free(w); if (n_failed) return -1; if (in_mem) return n_threads; @@ -2390,7 +2417,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const 
const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { - int ret = -1, res, i, n_files = 0; + int ret = -1, res, i, nref, n_files = 0; size_t max_k, k, max_mem, bam_mem_offset; sam_hdr_t *header = NULL; samFile *fp; @@ -2398,9 +2425,11 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const bam1_t *b = bam_init1(); uint8_t *bam_mem = NULL; char **fns = NULL; + size_t fns_size = 0; const char *new_so; buf_region *in_mem = NULL; int num_in_mem = 0; + int large_pos = 0; if (!b) { print_error("sort", "couldn't allocate memory for bam record"); @@ -2429,6 +2458,28 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const goto err; } + // Inspect the header looking for long chromosomes + // If there is one, we need to write temporary files in SAM format + nref = sam_hdr_nref(header); + for (i = 0; i < nref; i++) { + if (sam_hdr_tid2len(header, i) > INT32_MAX) + large_pos = 1; + } + + // Also check the output format is large position compatible + if (large_pos) { + int compatible = (out_fmt->format == sam + || (out_fmt->format == cram + && out_fmt->version.major >= 4) + || (out_fmt->format == unknown_format + && modeout[0] == 'w' + && (modeout[1] == 'z' || modeout[1] == '\0'))); + if (!compatible) { + print_error("sort", "output format is not compatible with very large references"); + goto err; + } + } + if (sort_by_tag != NULL) new_so = "unknown"; else if (is_by_qname) @@ -2512,10 +2563,15 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const ++k; if (mem_full) { - n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, - NULL); - if (n_files < 0) { + if (hts_resize(char *, n_files + (n_threads > 0 ? 
n_threads : 1), + &fns_size, &fns, 0) < 0) + goto err; + int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads, + NULL, large_pos, fns, fns_size); + if (new_n < 0) { goto err; + } else { + n_files = new_n; } k = 0; bam_mem_offset = 0; @@ -2531,7 +2587,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0])); if (!in_mem) goto err; num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, - in_mem); + in_mem, large_pos, fns, fns_size); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -2548,12 +2604,13 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const fprintf(stderr, "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", n_files, num_in_mem); - fns = (char**)calloc(n_files, sizeof(char*)); - if (!fns) goto err; + // Paranoia check - all temporary files should have a name for (i = 0; i < n_files; ++i) { - fns[i] = (char*)calloc(strlen(prefix) + 20, 1); - if (!fns[i]) goto err; - sprintf(fns[i], "%s.%.4d.bam", prefix, i); + if (!fns[i]) { + print_error("sort", + "BUG: no name stored for temporary file %d", i); + abort(); + } } if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, n_files, fns, num_in_mem, in_mem, buf, @@ -2721,8 +2778,13 @@ int bam_sort(int argc, char *argv[]) if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9); if (tmpprefix.l == 0) { - if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout); - else kputc('.', &tmpprefix); + if (strcmp(fnout, "-") != 0) { + char *idx = strstr(fnout, HTS_IDX_DELIM); + kputsn(fnout, idx ? 
idx - fnout : strlen(fnout), &tmpprefix); + kputs(".tmp", &tmpprefix); + } else { + kputc('.', &tmpprefix); + } } if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) { unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 6cbf66a..1385b29 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -49,6 +49,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "htslib/hts_endian.h" +#include "htslib/cram.h" #include "sam_opts.h" #include "samtools.h" #include "bedidx.h" @@ -1019,6 +1020,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m hts_reglist_t *lreg = NULL; merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; + refs_t *refs = NULL; // Is there a specified pre-prepared header to use for output? if (headers) { @@ -1101,23 +1103,21 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG, (flag & MERGE_FIRST_CO)? (i == 0) : true, RG[i])) - return -1; // FIXME: memory leak + goto fail; hdr[i] = hin; + int order_ok = 1; if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); + order_ok = 0; } - // Potential future improvement is to share headers between CRAM files for - // samtools sort (where all headers are identical. - // Eg: - // - // if (i > 1) { - // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram)); - // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram)); - // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram)); - // } + if (!refs) + refs = cram_get_refs(fp[i]); + + if (order_ok && refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs)) + goto fail; } // Did we get an @HD line? 
@@ -1136,7 +1136,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m goto fail; hout = merged_hdr->hdr; - if (!hout) return -1; // FIXME: memory leak + if (!hout) + goto fail; // If we're only merging a specified region move our iters to start at that point int tid, nreg; @@ -1286,6 +1287,9 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m } if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); + if (refs && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs)) + goto fail; + // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { @@ -1936,9 +1940,11 @@ typedef struct { const char *prefix; bam1_tag *buf; const sam_hdr_t *h; + char *tmpfile_name; int index; int error; int no_save; + int large_pos; } worker_t; // Returns 0 for success @@ -2246,7 +2252,9 @@ static void *worker(void *data) { worker_t *w = (worker_t*)data; char *name; + size_t name_len; w->error = 0; + w->tmpfile_name = NULL; if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) { if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { @@ -2282,40 +2290,42 @@ static void *worker(void *data) if (w->no_save) return 0; - name = (char*)calloc(strlen(w->prefix) + 20, 1); + name_len = strlen(w->prefix) + 30; + name = (char*)calloc(name_len, 1); if (!name) { w->error = errno; return 0; } - sprintf(name, "%s.%.4d.bam", w->prefix, w->index); - - uint32_t max_ncigar = 0; - int i; - for (i = 0; i < w->buf_len; i++) { - uint32_t nc = w->buf[i].bam_record->core.n_cigar; - if (max_ncigar < nc) - max_ncigar = nc; - } + const int MAX_TRIES = 1000; + int tries = 0; + for (;;) { + if (tries) { + snprintf(name, name_len, "%s.%.4d-%.3d.bam", + w->prefix, w->index, tries); + } else { + snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index); + } - if (max_ncigar > 65535) { - htsFormat fmt; - memset(&fmt, 0, sizeof(fmt)); - if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) { + if 
(write_buffer(name, w->large_pos ? "wzx1" : "wbx1", + w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) { + break; + } + if (errno == EEXIST && tries < MAX_TRIES) { + tries++; + } else { w->error = errno; - free(name); - return 0; + break; } + } - if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0) - w->error = errno; + if (w->error) { + free(name); } else { - if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0) - w->error = errno; + w->tmpfile_name = name; } - - free(name); return 0; } static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, - const sam_hdr_t *h, int n_threads, buf_region *in_mem) + const sam_hdr_t *h, int n_threads, buf_region *in_mem, + int large_pos, char **fns, size_t fns_size) { int i; size_t pos, rest; @@ -2339,6 +2349,8 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, w[i].prefix = prefix; w[i].h = h; w[i].index = n_files + i; + w[i].tmpfile_name = NULL; + w[i].large_pos = large_pos; if (in_mem) { w[i].no_save = 1; in_mem[i].from = pos; @@ -2351,12 +2363,27 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, } for (i = 0; i < n_threads; ++i) { pthread_join(tid[i], 0); + if (!in_mem) { + assert(w[i].index >= 0 && w[i].index < fns_size); + fns[w[i].index] = w[i].tmpfile_name; + } if (w[i].error != 0) { errno = w[i].error; print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index); n_failed++; } } + if (n_failed && !in_mem) { + // Clean up any temporary files that did get made, as we're + // about to lose track of them + for (i = 0; i < n_threads; ++i) { + if (fns[w[i].index]) { + unlink(fns[w[i].index]); + free(fns[w[i].index]); + fns[w[i].index] = NULL; + } + } + } free(tid); free(w); if (n_failed) return -1; if (in_mem) return n_threads; @@ -2392,7 +2419,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const 
const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { - int ret = -1, res, i, n_files = 0; + int ret = -1, res, i, nref, n_files = 0; size_t max_k, k, max_mem, bam_mem_offset; sam_hdr_t *header = NULL; samFile *fp; @@ -2400,9 +2427,11 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const bam1_t *b = bam_init1(); uint8_t *bam_mem = NULL; char **fns = NULL; + size_t fns_size = 0; const char *new_so; buf_region *in_mem = NULL; int num_in_mem = 0; + int large_pos = 0; if (!b) { print_error("sort", "couldn't allocate memory for bam record"); @@ -2431,6 +2460,28 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const goto err; } + // Inspect the header looking for long chromosomes + // If there is one, we need to write temporary files in SAM format + nref = sam_hdr_nref(header); + for (i = 0; i < nref; i++) { + if (sam_hdr_tid2len(header, i) > INT32_MAX) + large_pos = 1; + } + + // Also check the output format is large position compatible + if (large_pos) { + int compatible = (out_fmt->format == sam + || (out_fmt->format == cram + && out_fmt->version.major >= 4) + || (out_fmt->format == unknown_format + && modeout[0] == 'w' + && (modeout[1] == 'z' || modeout[1] == '\0'))); + if (!compatible) { + print_error("sort", "output format is not compatible with very large references"); + goto err; + } + } + if (sort_by_tag != NULL) new_so = "unknown"; else if (is_by_qname) @@ -2514,10 +2565,15 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const ++k; if (mem_full) { - n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, - NULL); - if (n_files < 0) { + if (hts_resize(char *, n_files + (n_threads > 0 ? 
n_threads : 1), + &fns_size, &fns, 0) < 0) + goto err; + int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads, + NULL, large_pos, fns, fns_size); + if (new_n < 0) { goto err; + } else { + n_files = new_n; } k = 0; bam_mem_offset = 0; @@ -2533,7 +2589,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0])); if (!in_mem) goto err; num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, - in_mem); + in_mem, large_pos, fns, fns_size); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -2550,12 +2606,13 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const fprintf(samtools_stderr, "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", n_files, num_in_mem); - fns = (char**)calloc(n_files, sizeof(char*)); - if (!fns) goto err; + // Paranoia check - all temporary files should have a name for (i = 0; i < n_files; ++i) { - fns[i] = (char*)calloc(strlen(prefix) + 20, 1); - if (!fns[i]) goto err; - sprintf(fns[i], "%s.%.4d.bam", prefix, i); + if (!fns[i]) { + print_error("sort", + "BUG: no name stored for temporary file %d", i); + abort(); + } } if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, n_files, fns, num_in_mem, in_mem, buf, @@ -2723,8 +2780,13 @@ int bam_sort(int argc, char *argv[]) if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9); if (tmpprefix.l == 0) { - if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout); - else kputc('.', &tmpprefix); + if (strcmp(fnout, "-") != 0) { + char *idx = strstr(fnout, HTS_IDX_DELIM); + kputsn(fnout, idx ? 
idx - fnout : strlen(fnout), &tmpprefix); + kputs(".tmp", &tmpprefix); + } else { + kputc('.', &tmpprefix); + } } if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) { unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c index 85eb372..d075a93 100644 --- a/samtools/bamshuf.c.pysam.c +++ b/samtools/bamshuf.c.pysam.c @@ -183,7 +183,7 @@ static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_f static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, - int is_samtools_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg) + int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg) { samFile *fp, *fpw = NULL, **fpt = NULL; char **fnt = NULL, modew[8]; @@ -228,7 +228,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? 
clevel : DEF_CLEVEL); - if (!is_samtools_stdout && !output_file) { // output to a file (name based on prefix) + if (!is_stdout && !output_file) { // output to a file (name based on prefix) char *fnw = (char*)calloc(l + 5, 1); if (!fnw) goto mem_fail; if (ga->out.format == unknown_format) @@ -246,7 +246,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, fpw = sam_open_format(output_file, modew, &ga->out); } else fpw = sam_open_format("-", modew, &ga->out); // output to samtools_stdout if (fpw == NULL) { - if (is_samtools_stdout) print_error_errno("collate", "Cannot open standard output"); + if (is_stdout) print_error_errno("collate", "Cannot open standard output"); else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre); goto fail; } @@ -582,7 +582,7 @@ char * generate_prefix() { int main_bamshuf(int argc, char *argv[]) { - int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0; + int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0; const char *output_file = NULL; char *prefix = NULL, *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -597,7 +597,7 @@ int main_bamshuf(int argc, char *argv[]) case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; case 'u': is_un = 1; break; - case 'O': is_samtools_stdout = 1; break; + case 'O': is_stdout = 1; break; case 'o': output_file = optarg; break; case 'f': fast_coll = 1; break; case 'r': reads_store = atoi(optarg); break; @@ -609,9 +609,9 @@ int main_bamshuf(int argc, char *argv[]) } if (is_un) clevel = 0; if (argc >= optind + 2) prefix = argv[optind+1]; - if (!(prefix || is_samtools_stdout || output_file)) + if (!(prefix || is_stdout || output_file)) return usage(samtools_stderr, n_files, reads_store); - if (is_samtools_stdout && output_file) { + if (is_stdout && 
output_file) { fprintf(samtools_stderr, "collate: -o and -O options cannot be used together.\n"); return usage(samtools_stderr, n_files, reads_store); } @@ -627,7 +627,7 @@ int main_bamshuf(int argc, char *argv[]) return 1; } - ret = bamshuf(argv[optind], n_files, prefix, clevel, is_samtools_stdout, + ret = bamshuf(argv[optind], n_files, prefix, clevel, is_stdout, output_file, fast_coll, reads_store, &ga, arg_list, no_pg); if (pre_mem) free(prefix); diff --git a/samtools/bamtk.c b/samtools/bamtk.c index 93e6468..fedfe69 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -68,6 +68,7 @@ int fqidx_main(int argc, char *argv[]); int amplicon_clip_main(int argc, char *argv[]); int main_ampliconstats(int argc, char *argv[]); int main_import(int argc, char *argv[]); +int main_samples(int argc, char *argv[]); const char *samtools_version() { @@ -191,6 +192,7 @@ static void usage(FILE *fp) " tview text alignment viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" +" samples list the samples in a set of SAM/BAM/CRAM files\n" "\n" " -- Misc\n" " help [cmd] display this help message or help for [cmd]\n" @@ -275,6 +277,7 @@ int main(int argc, char *argv[]) } else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); + else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) { long_version(); diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index dfb2cdd..3257ba1 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -71,6 +71,7 @@ int fqidx_main(int argc, char *argv[]); int amplicon_clip_main(int argc, char *argv[]); int main_ampliconstats(int argc, char *argv[]); int main_import(int argc, char *argv[]); +int main_samples(int argc, char *argv[]); const char *samtools_version() { @@ -194,6 
+195,7 @@ static void usage(FILE *fp) " tview text alignment viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" +" samples list the samples in a set of SAM/BAM/CRAM files\n" "\n" " -- Misc\n" " help [cmd] display this help message or help for [cmd]\n" @@ -278,6 +280,7 @@ int samtools_main(int argc, char *argv[]) } //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); + else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) { long_version(); diff --git a/samtools/coverage.c b/samtools/coverage.c index cab1f8b..5204cd4 100644 --- a/samtools/coverage.c +++ b/samtools/coverage.c @@ -139,7 +139,7 @@ static int usage() { " endpos End position (or sequence length)\n" " numreads Number reads aligned to the region (after filtering)\n" " covbases Number of covered bases with depth >= 1\n" - " coverage Proportion of covered bases [0..1]\n" + " coverage Percentage of covered bases [0..100]\n" " meandepth Mean depth of coverage\n" " meanbaseq Mean baseQ in covered region\n" " meanmapq Mean mapQ of selected reads\n" diff --git a/samtools/coverage.c.pysam.c b/samtools/coverage.c.pysam.c index 662deb5..d5affdf 100644 --- a/samtools/coverage.c.pysam.c +++ b/samtools/coverage.c.pysam.c @@ -141,7 +141,7 @@ static int usage() { " endpos End position (or sequence length)\n" " numreads Number reads aligned to the region (after filtering)\n" " covbases Number of covered bases with depth >= 1\n" - " coverage Proportion of covered bases [0..1]\n" + " coverage Percentage of covered bases [0..100]\n" " meandepth Mean depth of coverage\n" " meanbaseq Mean baseQ in covered region\n" " meanmapq Mean mapQ of selected reads\n" diff --git a/samtools/sam.c b/samtools/sam.c deleted file mode 100644 index 2df0ed1..0000000 --- a/samtools/sam.c 
+++ /dev/null @@ -1,147 +0,0 @@ -/* sam.c -- format-neutral SAM/BAM API. - - Copyright (C) 2009, 2012-2016 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#include - -#include -#include -#include "htslib/faidx.h" -#include "sam.h" - -int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) -{ - if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1; - if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1; - return 0; -} - -samfile_t *samopen(const char *fn, const char *mode, const void *aux) -{ - // hts_open() is really sam_open(), except for #define games - samFile *hts_fp = hts_open(fn, mode); - if (hts_fp == NULL) return NULL; - - samfile_t *fp = malloc(sizeof (samfile_t)); - if (!fp) { - sam_close(hts_fp); - return NULL; - } - fp->file = hts_fp; - fp->x.bam = hts_fp->fp.bgzf; - if (strchr(mode, 'r')) { - if (aux) { - if (hts_set_fai_filename(fp->file, aux) != 0) { - sam_close(hts_fp); - free(fp); - return NULL; - } - } - fp->header = sam_hdr_read(fp->file); // samclose() will free this - if (fp->header == NULL) { - sam_close(hts_fp); - free(fp); - return NULL; - } - fp->is_write = 0; - if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1) - fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); - } - else { - enum htsExactFormat fmt = hts_get_format(fp->file)->format; - fp->header = (sam_hdr_t *)aux; // For writing, we won't free it - fp->is_write = 1; - if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { - if (sam_hdr_write(fp->file, fp->header) < 0) { - if (bam_verbose >= 1) - fprintf(stderr, "[samopen] Couldn't write header\n"); - sam_close(hts_fp); - free(fp); - return NULL; - } - } - } - - return fp; -} - -void samclose(samfile_t *fp) -{ - if (fp) { - if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header); - sam_close(fp->file); - free(fp); - } -} - -int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) -{ - bam1_t *b = bam_init1(); - hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end); - int ret; - while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data); - hts_itr_destroy(iter); - 
bam_destroy1(b); - return (ret == -1)? 0 : ret; -} - -int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) -{ - bam_plbuf_t *buf; - int ret; - bam1_t *b; - b = bam_init1(); - buf = bam_plbuf_init(func, func_data); - if (mask < 0) mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; - else mask |= BAM_FUNMAP; - while ((ret = samread(fp, b)) >= 0) { - // bam_plp_push() itself now filters out unmapped reads only - if (b->core.flag & mask) b->core.flag |= BAM_FUNMAP; - bam_plbuf_push(b, buf); - } - bam_plbuf_push(0, buf); - bam_plbuf_destroy(buf); - bam_destroy1(b); - return 0; -} - -char *samfaipath(const char *fn_ref) -{ - char *fn_list = 0; - if (fn_ref == 0) return 0; - fn_list = calloc(strlen(fn_ref) + 5, 1); - strcat(strcpy(fn_list, fn_ref), ".fai"); - if (access(fn_list, R_OK) == -1) { // fn_list is unreadable - if (access(fn_ref, R_OK) == -1) { - fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); - } else { - if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); - if (fai_build(fn_ref) == -1) { - fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); - free(fn_list); fn_list = 0; - } - } - } - return fn_list; -} diff --git a/samtools/sam.c.pysam.c b/samtools/sam.c.pysam.c deleted file mode 100644 index 64ac88d..0000000 --- a/samtools/sam.c.pysam.c +++ /dev/null @@ -1,149 +0,0 @@ -#include "samtools.pysam.h" - -/* sam.c -- format-neutral SAM/BAM API. - - Copyright (C) 2009, 2012-2016 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. 
- - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#include - -#include -#include -#include "htslib/faidx.h" -#include "sam.h" - -int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) -{ - if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1; - if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1; - return 0; -} - -samfile_t *samopen(const char *fn, const char *mode, const void *aux) -{ - // hts_open() is really sam_open(), except for #define games - samFile *hts_fp = hts_open(fn, mode); - if (hts_fp == NULL) return NULL; - - samfile_t *fp = malloc(sizeof (samfile_t)); - if (!fp) { - sam_close(hts_fp); - return NULL; - } - fp->file = hts_fp; - fp->x.bam = hts_fp->fp.bgzf; - if (strchr(mode, 'r')) { - if (aux) { - if (hts_set_fai_filename(fp->file, aux) != 0) { - sam_close(hts_fp); - free(fp); - return NULL; - } - } - fp->header = sam_hdr_read(fp->file); // samclose() will free this - if (fp->header == NULL) { - sam_close(hts_fp); - free(fp); - return NULL; - } - fp->is_write = 0; - if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1) - fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); - } - else { - enum htsExactFormat fmt = hts_get_format(fp->file)->format; - fp->header = (sam_hdr_t *)aux; // For writing, we won't free it - fp->is_write = 1; - if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { - if (sam_hdr_write(fp->file, fp->header) < 0) { - if (bam_verbose >= 1) - fprintf(samtools_stderr, "[samopen] Couldn't write header\n"); - sam_close(hts_fp); - free(fp); - return NULL; - } - } - } - - return fp; -} - -void samclose(samfile_t *fp) -{ - if (fp) { - if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header); - sam_close(fp->file); - free(fp); - } -} - -int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) -{ - bam1_t *b = bam_init1(); - hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end); - int ret; - while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data); - 
hts_itr_destroy(iter); - bam_destroy1(b); - return (ret == -1)? 0 : ret; -} - -int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) -{ - bam_plbuf_t *buf; - int ret; - bam1_t *b; - b = bam_init1(); - buf = bam_plbuf_init(func, func_data); - if (mask < 0) mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; - else mask |= BAM_FUNMAP; - while ((ret = samread(fp, b)) >= 0) { - // bam_plp_push() itself now filters out unmapped reads only - if (b->core.flag & mask) b->core.flag |= BAM_FUNMAP; - bam_plbuf_push(b, buf); - } - bam_plbuf_push(0, buf); - bam_plbuf_destroy(buf); - bam_destroy1(b); - return 0; -} - -char *samfaipath(const char *fn_ref) -{ - char *fn_list = 0; - if (fn_ref == 0) return 0; - fn_list = calloc(strlen(fn_ref) + 5, 1); - strcat(strcpy(fn_list, fn_ref), ".fai"); - if (access(fn_list, R_OK) == -1) { // fn_list is unreadable - if (access(fn_ref, R_OK) == -1) { - fprintf(samtools_stderr, "[samfaipath] fail to read file %s.\n", fn_ref); - } else { - if (bam_verbose >= 3) fprintf(samtools_stderr, "[samfaipath] build FASTA index...\n"); - if (fai_build(fn_ref) == -1) { - fprintf(samtools_stderr, "[samfaipath] fail to build FASTA index.\n"); - free(fn_list); fn_list = 0; - } - } - } - return fn_list; -} diff --git a/samtools/sam.h b/samtools/sam.h deleted file mode 100644 index 833279d..0000000 --- a/samtools/sam.h +++ /dev/null @@ -1,151 +0,0 @@ -/* sam.h -- format-neutral SAM/BAM API. - - Copyright (C) 2009, 2013-2015, 2019 Genome Research Ltd. 
- - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#ifndef BAM_SAM_H -#define BAM_SAM_H - -#include "htslib/sam.h" -#include "bam.h" - -/*! - @header - - This file provides higher level of I/O routines and unifies the APIs - for SAM and BAM formats. These APIs are more convenient and - recommended. - - @copyright Genome Research Ltd. - */ - -/*! @typedef - @abstract SAM/BAM file handler - @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format - @field bam BAM file handler; valid if (type&1) == 1 - @field tamr SAM file handler for reading; valid if type == 2 - @field tamw SAM file handler for writing; valid if type == 0 - @field header header struct - */ -typedef struct { - samFile *file; - struct { BGZF *bam; } x; // Hack so that fp->x.bam still works - sam_hdr_t *header; - unsigned short is_write:1; -} samfile_t; - -#ifdef __cplusplus -extern "C" { -#endif - - /*! 
- @abstract Open a SAM/BAM file - - @param fn SAM/BAM file name; "-" is recognized as stdin (for - reading) or stdout (for writing). - - @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, - 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, - 'h' for outputing header in SAM, 'x' for HEX flag and 'X' for - string flag. If 'b' present, it must immediately follow 'r' or - 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", - "rb", "wb" and "wbu" exclusively. - - @param aux auxiliary data; if mode[0]=='w', aux points to - bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM - are absent, aux points the file name of the list of the reference; - aux is not used otherwise. If @SQ header lines are present in SAM, - aux is not used, either. - - @return SAM/BAM file handler - */ - samfile_t *samopen(const char *fn, const char *mode, const void *aux); - - /*! - @abstract Close a SAM/BAM handler - @param fp file handler to be closed - */ - void samclose(samfile_t *fp); - - /*! - @abstract Read one alignment - @param fp file handler - @param b alignment - @return bytes read - */ - static inline int samread(samfile_t *fp, bam1_t *b) { return sam_read1(fp->file, fp->header, b); } - - /*! - @abstract Write one alignment - @param fp file handler - @param b alignment - @return bytes written - */ - static inline int samwrite(samfile_t *fp, const bam1_t *b) { return sam_write1(fp->file, fp->header, b); } - - /*! 
- @abstract Load BAM/CRAM index for use with samfetch() with supporting the use of index file - @param fp file handler - @param fn name of the BAM or CRAM file (NOT the index file) - @param fnidx name of the index file - @return pointer to the index structure - */ - static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn, const char *fnidx) { - if (fnidx != NULL) { - return sam_index_load2(fp->file, fn, fnidx); - } - return sam_index_load(fp->file, fn); - } - #undef sam_index_load - #define sam_index_load(fp,fn,fnidx) (samtools_sam_index_load((fp), (fn), (fnidx))) - - /*! - @abstract Retrieve the alignments overlapping the specified region. - @discussion A user defined function will be called for each - retrieved alignment ordered by its start position. - @param fp file handler - @param idx index returned by sam_index_load() - @param tid chromosome ID as is defined in the header - @param beg start coordinate, 0-based - @param end end coordinate, 0-based - @param data user provided data (will be transferred to func) - @param func user defined function - */ - int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); - - /*! - @abstract Get the pileup for a whole alignment file - @param fp file handler - @param mask mask transferred to bam_plbuf_set_mask() - @param func user defined function called in the pileup process - #param data user provided data for func() - */ - int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); - - char *samfaipath(const char *fn_ref); - int samthreads(samfile_t *fp, int n_threads, int n_sub_blks); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/samtools/sam_utils.c b/samtools/sam_utils.c index 42a3668..f105687 100644 --- a/samtools/sam_utils.c +++ b/samtools/sam_utils.c @@ -32,9 +32,21 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "samtools.h" +static htsFile *samtools_stdout = NULL; + +void autoflush_if_stdout(htsFile *fp, const char *fname) { + if (fname == NULL || strcmp(fname, "-") == 0) samtools_stdout = fp; +} + +void release_autoflush(htsFile *fp) { + if (samtools_stdout == fp) samtools_stdout = NULL; +} + static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) { fflush(stdout); + if (samtools_stdout) hts_flush(samtools_stdout); + if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand); else fprintf(stderr, "samtools: "); vfprintf(stderr, format, args); @@ -62,6 +74,7 @@ void print_error_errno(const char *subcommand, const char *format, ...) void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) { + release_autoflush(fp); int r = sam_close(fp); if (r >= 0) return; diff --git a/samtools/sam_utils.c.pysam.c b/samtools/sam_utils.c.pysam.c index e150c70..a5f08a9 100644 --- a/samtools/sam_utils.c.pysam.c +++ b/samtools/sam_utils.c.pysam.c @@ -34,9 +34,21 @@ DEALINGS IN THE SOFTWARE. */ #include "samtools.h" +static htsFile *samtools_stdout_internal = NULL; + +void autoflush_if_stdout(htsFile *fp, const char *fname) { + if (fname == NULL || strcmp(fname, "-") == 0) samtools_stdout_internal = fp; +} + +void release_autoflush(htsFile *fp) { + if (samtools_stdout_internal == fp) samtools_stdout_internal = NULL; +} + static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) { fflush(samtools_stdout); + if (samtools_stdout_internal) hts_flush(samtools_stdout_internal); + if (subcommand && *subcommand) fprintf(samtools_stderr, "samtools %s: ", subcommand); else fprintf(samtools_stderr, "samtools: "); vfprintf(samtools_stderr, format, args); @@ -64,6 +76,7 @@ void print_error_errno(const char *subcommand, const char *format, ...) 
void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) { + release_autoflush(fp); int r = sam_close(fp); if (r >= 0) return; diff --git a/samtools/sam_view.c b/samtools/sam_view.c index 515eaa5..7c4d7cc 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -40,21 +40,29 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts_expr.h" #include "samtools.h" #include "sam_opts.h" +#include "bam.h" // for bam_get_library and bam_remove_B #include "bedidx.h" KHASH_SET_INIT_STR(str) - typedef khash_t(str) *strhash_t; +KHASH_SET_INIT_INT(aux_exists) +typedef khash_t(aux_exists) *auxhash_t; + // This structure contains the settings for a samview run typedef struct samview_settings { strhash_t rghash; strhash_t rnhash; strhash_t tvhash; int min_mapQ; - int flag_on; - int flag_off; - int flag_alloff; + + // Described here in the same terms as the usage statement. + // The code however always negates to "reject if" keep if: + int flag_on; // keep if (FLAG & N) == N (all on) + int flag_off; // keep if (FLAG & N) == 0 (all off) + int flag_anyon; // keep if (FLAG & N) != 0 (any on) + int flag_alloff; // reject if (FLAG & N) == N (any off) + int min_qlen; int remove_B; uint32_t subsam_seed; @@ -68,16 +76,65 @@ typedef struct samview_settings { hts_filter_t *filter; int remove_flag; int add_flag; + int unmap; + auxhash_t remove_tag; + auxhash_t keep_tag; } samview_settings_t; +// Copied from htslib/sam.c. +// TODO: we need a proper interface to find the length of an aux tag, +// or at the very make exportable versions of these in htslib. 
+static inline int aux_type2size(uint8_t type) +{ + switch (type) { + case 'A': case 'c': case 'C': + return 1; + case 's': case 'S': + return 2; + case 'i': case 'I': case 'f': + return 4; + case 'd': + return 8; + case 'Z': case 'H': case 'B': + return type; + default: + return 0; + } +} -// TODO Add declarations of these to a viable htslib or samtools header -extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); -extern int bam_remove_B(bam1_t *b); +// Copied from htslib/sam.c. +static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) +{ + int size; + uint32_t n; + if (s >= end) return end; + size = aux_type2size(*s); ++s; // skip type + switch (size) { + case 'Z': + case 'H': + while (s < end && *s) ++s; + return s < end ? s + 1 : end; + case 'B': + if (end - s < 5) return NULL; + size = aux_type2size(*s); ++s; + n = le_to_u32(s); + s += 4; + if (size == 0 || end - s < size * n) return NULL; + return s + size * n; + case 0: + return NULL; + default: + if (end - s < size) return NULL; + return s + size; + } +} // Returns 0 to indicate read should be output 1 otherwise static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { + if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) + return 1; + if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; @@ -91,6 +148,8 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin return 1; if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) return 1; + if (settings->flag_anyon && ((b->core.flag & settings->flag_anyon) == 0)) + return 1; if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) 
{ @@ -137,18 +196,50 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } - if (settings->remove_aux_len) { - size_t i; - for (i = 0; i < settings->remove_aux_len; ++i) { - uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); - if (s) { - bam_aux_del(b, s); + if (settings->keep_tag) { + uint8_t *s_from, *s_to, *end = b->data + b->l_data; + auxhash_t h = settings->keep_tag; + + s_from = s_to = bam_get_aux(b); + while (s_from < end) { + int x = (int)s_from[0]<<8 | s_from[1]; + uint8_t *s = skip_aux(s_from+2, end); + if (s == NULL) { + print_error("view", "malformed aux data for record \"%s\"", + bam_get_qname(b)); + break; } + + if (kh_get(aux_exists, h, x) != kh_end(h) ) { + if (s_to != s_from) memmove(s_to, s_from, s - s_from); + s_to += s - s_from; + } + s_from = s; } - } + b->l_data = s_to - b->data; + + } else if (settings->remove_tag) { + uint8_t *s_from, *s_to, *end = b->data + b->l_data; + auxhash_t h = settings->remove_tag; + + s_from = s_to = bam_get_aux(b); + while (s_from < end) { + int x = (int)s_from[0]<<8 | s_from[1]; + uint8_t *s = skip_aux(s_from+2, end); + if (s == NULL) { + print_error("view", "malformed aux data for record \"%s\"", + bam_get_qname(b)); + break; + } - if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) - return 1; + if (kh_get(aux_exists, h, x) == kh_end(h) ) { + if (s_to != s_from) memmove(s_to, s_from, s - s_from); + s_to += s - s_from; + } + s_from = s; + } + b->l_data = s_to - b->data; + } return 0; } @@ -286,6 +377,33 @@ static inline void change_flag(bam1_t *b, samview_settings_t *settings) b->core.flag &= ~settings->remove_flag; } +int parse_aux_list(auxhash_t *h, char *optarg) { + if (!*h) + *h = kh_init(aux_exists); + + while (strlen(optarg) >= 2) { + int x = optarg[0]<<8 | optarg[1]; + int ret = 0; + kh_put(aux_exists, *h, x, &ret); + if (ret < 0) + return -1; + + 
optarg += 2; + if (*optarg == ',') // allow white-space too for easy `cat file`? + optarg++; + else if (*optarg != 0) + break; + } + + if (strlen(optarg) != 0) { + fprintf(stderr, "main_samview: Error parsing option, " + "auxiliary tags should be exactly two characters long.\n"); + return -1; + } + + return 0; +} + // Make mnemonic distinct values for longoption-only options #define LONGOPT(c) ((c) + 128) @@ -311,6 +429,7 @@ int main_samview(int argc, char *argv[]) .flag_on = 0, .flag_off = 0, .flag_alloff = 0, + .flag_anyon = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, @@ -321,7 +440,10 @@ int main_samview(int argc, char *argv[]) .tag = NULL, .filter = NULL, .remove_flag = 0, - .add_flag = 0 + .add_flag = 0, + .keep_tag = NULL, + .remove_tag = NULL, + .unmap = 0, }; static const struct option lopts[] = { @@ -340,6 +462,10 @@ int main_samview(int argc, char *argv[]) {"fast", no_argument, NULL, '1'}, {"header-only", no_argument, NULL, 'H'}, {"help", no_argument, NULL, LONGOPT('?')}, + {"incl-flags", required_argument, NULL, LONGOPT('g')}, + {"include-flags", required_argument, NULL, LONGOPT('g')}, + {"rf", required_argument, NULL, LONGOPT('g')}, // aka incl-flags + {"keep-tag", required_argument, NULL, LONGOPT('x') }, {"library", required_argument, NULL, 'l'}, {"min-mapq", required_argument, NULL, 'q'}, {"min-MQ", required_argument, NULL, 'q'}, @@ -368,10 +494,10 @@ int main_samview(int argc, char *argv[]) {"target-file", required_argument, NULL, 'L'}, {"targets-file", required_argument, NULL, 'L'}, {"uncompressed", no_argument, NULL, 'u'}, + {"unmap", no_argument, NULL, 'p'}, {"unoutput", required_argument, NULL, 'U'}, {"use-index", no_argument, NULL, 'M'}, {"with-header", no_argument, NULL, 'h'}, - { NULL, 0, NULL, 0 } }; /* parse command-line options */ @@ -387,7 +513,7 @@ int main_samview(int argc, char *argv[]) opterr = 0; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:", + 
"SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:p", lopts, NULL)) >= 0) { switch (c) { case 's': @@ -426,11 +552,14 @@ int main_samview(int argc, char *argv[]) case 'X': has_index_file = 1; break; case 'f': settings.flag_on |= bam_str2flag(optarg); break; case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case LONGOPT('g'): + settings.flag_anyon |= bam_str2flag(optarg); break; case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; + case 'p': settings.unmap = 1; break; case LONGOPT('L'): settings.multi_region = 1; // fall through @@ -541,16 +670,7 @@ int main_samview(int argc, char *argv[]) return usage(stderr, EXIT_FAILURE, 0); } case 'B': settings.remove_B = 1; break; - case 'x': - { - if (strlen(optarg) != 2) { - print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long."); - return usage(stderr, EXIT_FAILURE, 0); - } - settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); - settings.remove_aux[settings.remove_aux_len-1] = optarg; - } - break; + case 'M': settings.multi_region = 1; break; case LONGOPT('P'): no_pg = 1; break; case 'e': @@ -561,6 +681,22 @@ int main_samview(int argc, char *argv[]) break; case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; + + case 'x': + if (*optarg == '^') { + if (parse_aux_list(&settings.keep_tag, optarg+1)) + return usage(stderr, EXIT_FAILURE, 0); + } else { + if (parse_aux_list(&settings.remove_tag, optarg)) + return usage(stderr, EXIT_FAILURE, 0); + } + break; + + case LONGOPT('x'): + if (parse_aux_list(&settings.keep_tag, optarg)) + return usage(stderr, EXIT_FAILURE, 0); + break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) 
return usage(stderr, EXIT_FAILURE, 0); @@ -587,6 +723,13 @@ int main_samview(int argc, char *argv[]) print_error("view", "No input provided or missing option argument."); return usage(stderr, EXIT_FAILURE, 0); // potential memory leak... } + + if (settings.unmap && fn_un_out) { + print_error("view", "Options --unoutput and --unmap are mutually exclusive."); + ret = 1; + goto view_end; + } + if (settings.subsam_seed != 0) { // Convert likely user input 1,2,... to pseudo-random // values with more entropy and more bits set @@ -629,6 +772,7 @@ int main_samview(int argc, char *argv[]) goto view_end; } } + autoflush_if_stdout(out, fn_out); if (!no_pg) { if (!(arg_list = stringify_argv(argc+1, argv-1))) { @@ -676,6 +820,7 @@ int main_samview(int argc, char *argv[]) goto view_end; } } + autoflush_if_stdout(un_out, fn_un_out); if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { @@ -763,12 +908,15 @@ int main_samview(int argc, char *argv[]) if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; + } else if (settings.unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (result < -1) { - fprintf(stderr, "[main_samview] retrieval of region %d failed due to truncated file or corrupt BAM index file\n", iter->curr_tid); + print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); ret = 1; } @@ -797,6 +945,9 @@ int main_samview(int argc, char *argv[]) if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; + } else if (settings.unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } @@ -838,13 +989,16 @@ 
int main_samview(int argc, char *argv[]) if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; + } else if (settings.unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } hts_itr_destroy(iter); if (result < -1) { - fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + print_error("view", "retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file", argv[i]); ret = 1; break; } @@ -902,6 +1056,7 @@ view_end: if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); kh_destroy(str, settings.tvhash); } + if (settings.remove_aux_len) { free(settings.remove_aux); } @@ -920,6 +1075,11 @@ view_end: free(fn_un_out_idx); free(arg_list); + if (settings.keep_tag) + kh_destroy(aux_exists, settings.keep_tag); + if (settings.remove_tag) + kh_destroy(aux_exists, settings.remove_tag); + return ret; } @@ -929,6 +1089,7 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "Usage: samtools view [options] || [region ...]\n" "\n" + "Output options:\n" " -b, --bam Output BAM\n" " -C, --cram Output CRAM (requires -T)\n" @@ -941,6 +1102,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -o, --output FILE Write output to FILE [standard output]\n" " -U, --unoutput FILE, --output-unselected FILE\n" " Output reads not selected by filters to FILE\n" +" -p, --unmap Set flag to UNMAP on reads not selected\n" +" then write to output file.\n" "Input options:\n" " -t, --fai-reference FILE FILE listing reference names and lengths\n" " -M, --use-index Use index and multi-region iterator for regions\n" @@ -968,7 +1131,11 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "Processing options:\n" " --add-flags FLAG Add FLAGs to reads\n" " --remove-flags FLAG Remove FLAGs from 
reads\n" -" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n" +" -x, --remove-tag STR\n" +" Comma-separated read tags to strip (repeatable) [null]\n" +" --keep-tag STR\n" +" Comma-separated read tags to preserve (repeatable) [null].\n" +" Equivalent to \"-x ^STR\"\n" " -B, --remove-B Collapse the backward CIGAR operation\n" "\n" "General options:\n" diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 42c42e4..9bcc9ac 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -42,21 +42,29 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts_expr.h" #include "samtools.h" #include "sam_opts.h" +#include "bam.h" // for bam_get_library and bam_remove_B #include "bedidx.h" KHASH_SET_INIT_STR(str) - typedef khash_t(str) *strhash_t; +KHASH_SET_INIT_INT(aux_exists) +typedef khash_t(aux_exists) *auxhash_t; + // This structure contains the settings for a samview run typedef struct samview_settings { strhash_t rghash; strhash_t rnhash; strhash_t tvhash; int min_mapQ; - int flag_on; - int flag_off; - int flag_alloff; + + // Described here in the same terms as the usage statement. + // The code however always negates to "reject if" keep if: + int flag_on; // keep if (FLAG & N) == N (all on) + int flag_off; // keep if (FLAG & N) == 0 (all off) + int flag_anyon; // keep if (FLAG & N) != 0 (any on) + int flag_alloff; // reject if (FLAG & N) == N (any off) + int min_qlen; int remove_B; uint32_t subsam_seed; @@ -70,16 +78,65 @@ typedef struct samview_settings { hts_filter_t *filter; int remove_flag; int add_flag; + int unmap; + auxhash_t remove_tag; + auxhash_t keep_tag; } samview_settings_t; +// Copied from htslib/sam.c. +// TODO: we need a proper interface to find the length of an aux tag, +// or at the very make exportable versions of these in htslib. 
+static inline int aux_type2size(uint8_t type) +{ + switch (type) { + case 'A': case 'c': case 'C': + return 1; + case 's': case 'S': + return 2; + case 'i': case 'I': case 'f': + return 4; + case 'd': + return 8; + case 'Z': case 'H': case 'B': + return type; + default: + return 0; + } +} -// TODO Add declarations of these to a viable htslib or samtools header -extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); -extern int bam_remove_B(bam1_t *b); +// Copied from htslib/sam.c. +static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) +{ + int size; + uint32_t n; + if (s >= end) return end; + size = aux_type2size(*s); ++s; // skip type + switch (size) { + case 'Z': + case 'H': + while (s < end && *s) ++s; + return s < end ? s + 1 : end; + case 'B': + if (end - s < 5) return NULL; + size = aux_type2size(*s); ++s; + n = le_to_u32(s); + s += 4; + if (size == 0 || end - s < size * n) return NULL; + return s + size * n; + case 0: + return NULL; + default: + if (end - s < size) return NULL; + return s + size; + } +} // Returns 0 to indicate read should be output 1 otherwise static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { + if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) + return 1; + if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; @@ -93,6 +150,8 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin return 1; if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) return 1; + if (settings->flag_anyon && ((b->core.flag & settings->flag_anyon) == 0)) + return 1; if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) 
{ @@ -139,18 +198,50 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } - if (settings->remove_aux_len) { - size_t i; - for (i = 0; i < settings->remove_aux_len; ++i) { - uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); - if (s) { - bam_aux_del(b, s); + if (settings->keep_tag) { + uint8_t *s_from, *s_to, *end = b->data + b->l_data; + auxhash_t h = settings->keep_tag; + + s_from = s_to = bam_get_aux(b); + while (s_from < end) { + int x = (int)s_from[0]<<8 | s_from[1]; + uint8_t *s = skip_aux(s_from+2, end); + if (s == NULL) { + print_error("view", "malformed aux data for record \"%s\"", + bam_get_qname(b)); + break; } + + if (kh_get(aux_exists, h, x) != kh_end(h) ) { + if (s_to != s_from) memmove(s_to, s_from, s - s_from); + s_to += s - s_from; + } + s_from = s; } - } + b->l_data = s_to - b->data; + + } else if (settings->remove_tag) { + uint8_t *s_from, *s_to, *end = b->data + b->l_data; + auxhash_t h = settings->remove_tag; + + s_from = s_to = bam_get_aux(b); + while (s_from < end) { + int x = (int)s_from[0]<<8 | s_from[1]; + uint8_t *s = skip_aux(s_from+2, end); + if (s == NULL) { + print_error("view", "malformed aux data for record \"%s\"", + bam_get_qname(b)); + break; + } - if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) - return 1; + if (kh_get(aux_exists, h, x) == kh_end(h) ) { + if (s_to != s_from) memmove(s_to, s_from, s - s_from); + s_to += s - s_from; + } + s_from = s; + } + b->l_data = s_to - b->data; + } return 0; } @@ -288,6 +379,33 @@ static inline void change_flag(bam1_t *b, samview_settings_t *settings) b->core.flag &= ~settings->remove_flag; } +int parse_aux_list(auxhash_t *h, char *optarg) { + if (!*h) + *h = kh_init(aux_exists); + + while (strlen(optarg) >= 2) { + int x = optarg[0]<<8 | optarg[1]; + int ret = 0; + kh_put(aux_exists, *h, x, &ret); + if (ret < 0) + return -1; + + 
optarg += 2; + if (*optarg == ',') // allow white-space too for easy `cat file`? + optarg++; + else if (*optarg != 0) + break; + } + + if (strlen(optarg) != 0) { + fprintf(samtools_stderr, "main_samview: Error parsing option, " + "auxiliary tags should be exactly two characters long.\n"); + return -1; + } + + return 0; +} + // Make mnemonic distinct values for longoption-only options #define LONGOPT(c) ((c) + 128) @@ -313,6 +431,7 @@ int main_samview(int argc, char *argv[]) .flag_on = 0, .flag_off = 0, .flag_alloff = 0, + .flag_anyon = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, @@ -323,7 +442,10 @@ int main_samview(int argc, char *argv[]) .tag = NULL, .filter = NULL, .remove_flag = 0, - .add_flag = 0 + .add_flag = 0, + .keep_tag = NULL, + .remove_tag = NULL, + .unmap = 0, }; static const struct option lopts[] = { @@ -342,6 +464,10 @@ int main_samview(int argc, char *argv[]) {"fast", no_argument, NULL, '1'}, {"header-only", no_argument, NULL, 'H'}, {"help", no_argument, NULL, LONGOPT('?')}, + {"incl-flags", required_argument, NULL, LONGOPT('g')}, + {"include-flags", required_argument, NULL, LONGOPT('g')}, + {"rf", required_argument, NULL, LONGOPT('g')}, // aka incl-flags + {"keep-tag", required_argument, NULL, LONGOPT('x') }, {"library", required_argument, NULL, 'l'}, {"min-mapq", required_argument, NULL, 'q'}, {"min-MQ", required_argument, NULL, 'q'}, @@ -370,10 +496,10 @@ int main_samview(int argc, char *argv[]) {"target-file", required_argument, NULL, 'L'}, {"targets-file", required_argument, NULL, 'L'}, {"uncompressed", no_argument, NULL, 'u'}, + {"unmap", no_argument, NULL, 'p'}, {"unoutput", required_argument, NULL, 'U'}, {"use-index", no_argument, NULL, 'M'}, {"with-header", no_argument, NULL, 'h'}, - { NULL, 0, NULL, 0 } }; /* parse command-line options */ @@ -389,7 +515,7 @@ int main_samview(int argc, char *argv[]) opterr = 0; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:", + 
"SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:p", lopts, NULL)) >= 0) { switch (c) { case 's': @@ -428,11 +554,14 @@ int main_samview(int argc, char *argv[]) case 'X': has_index_file = 1; break; case 'f': settings.flag_on |= bam_str2flag(optarg); break; case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case LONGOPT('g'): + settings.flag_anyon |= bam_str2flag(optarg); break; case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; + case 'p': settings.unmap = 1; break; case LONGOPT('L'): settings.multi_region = 1; // fall through @@ -543,16 +672,7 @@ int main_samview(int argc, char *argv[]) return usage(samtools_stderr, EXIT_FAILURE, 0); } case 'B': settings.remove_B = 1; break; - case 'x': - { - if (strlen(optarg) != 2) { - print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long."); - return usage(samtools_stderr, EXIT_FAILURE, 0); - } - settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); - settings.remove_aux[settings.remove_aux_len-1] = optarg; - } - break; + case 'M': settings.multi_region = 1; break; case LONGOPT('P'): no_pg = 1; break; case 'e': @@ -563,6 +683,22 @@ int main_samview(int argc, char *argv[]) break; case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; + + case 'x': + if (*optarg == '^') { + if (parse_aux_list(&settings.keep_tag, optarg+1)) + return usage(samtools_stderr, EXIT_FAILURE, 0); + } else { + if (parse_aux_list(&settings.remove_tag, optarg)) + return usage(samtools_stderr, EXIT_FAILURE, 0); + } + break; + + case LONGOPT('x'): + if (parse_aux_list(&settings.keep_tag, optarg)) + return usage(samtools_stderr, EXIT_FAILURE, 0); + break; + default: if 
(parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(samtools_stderr, EXIT_FAILURE, 0); @@ -589,6 +725,13 @@ int main_samview(int argc, char *argv[]) print_error("view", "No input provided or missing option argument."); return usage(samtools_stderr, EXIT_FAILURE, 0); // potential memory leak... } + + if (settings.unmap && fn_un_out) { + print_error("view", "Options --unoutput and --unmap are mutually exclusive."); + ret = 1; + goto view_end; + } + if (settings.subsam_seed != 0) { // Convert likely user input 1,2,... to pseudo-random // values with more entropy and more bits set @@ -631,6 +774,7 @@ int main_samview(int argc, char *argv[]) goto view_end; } } + autoflush_if_stdout(out, fn_out); if (!no_pg) { if (!(arg_list = stringify_argv(argc+1, argv-1))) { @@ -678,6 +822,7 @@ int main_samview(int argc, char *argv[]) goto view_end; } } + autoflush_if_stdout(un_out, fn_un_out); if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { @@ -765,12 +910,15 @@ int main_samview(int argc, char *argv[]) if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; + } else if (settings.unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (result < -1) { - fprintf(samtools_stderr, "[main_samview] retrieval of region %d failed due to truncated file or corrupt BAM index file\n", iter->curr_tid); + print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); ret = 1; } @@ -799,6 +947,9 @@ int main_samview(int argc, char *argv[]) if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; + } else if (settings.unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } else { if (un_out) { if 
(check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } @@ -840,13 +991,16 @@ int main_samview(int argc, char *argv[]) if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; + } else if (settings.unmap) { + b->core.flag |= BAM_FUNMAP; + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } hts_itr_destroy(iter); if (result < -1) { - fprintf(samtools_stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + print_error("view", "retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file", argv[i]); ret = 1; break; } @@ -904,6 +1058,7 @@ view_end: if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); kh_destroy(str, settings.tvhash); } + if (settings.remove_aux_len) { free(settings.remove_aux); } @@ -922,6 +1077,11 @@ view_end: free(fn_un_out_idx); free(arg_list); + if (settings.keep_tag) + kh_destroy(aux_exists, settings.keep_tag); + if (settings.remove_tag) + kh_destroy(aux_exists, settings.remove_tag); + return ret; } @@ -931,6 +1091,7 @@ static int usage(FILE *fp, int exit_status, int is_long_help) "\n" "Usage: samtools view [options] || [region ...]\n" "\n" + "Output options:\n" " -b, --bam Output BAM\n" " -C, --cram Output CRAM (requires -T)\n" @@ -943,6 +1104,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -o, --output FILE Write output to FILE [standard output]\n" " -U, --unoutput FILE, --output-unselected FILE\n" " Output reads not selected by filters to FILE\n" +" -p, --unmap Set flag to UNMAP on reads not selected\n" +" then write to output file.\n" "Input options:\n" " -t, --fai-reference FILE FILE listing reference names and lengths\n" " -M, --use-index Use index and multi-region iterator for regions\n" @@ -970,7 +1133,11 @@ static int usage(FILE *fp, int exit_status, int is_long_help) 
"Processing options:\n" " --add-flags FLAG Add FLAGs to reads\n" " --remove-flags FLAG Remove FLAGs from reads\n" -" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n" +" -x, --remove-tag STR\n" +" Comma-separated read tags to strip (repeatable) [null]\n" +" --keep-tag STR\n" +" Comma-separated read tags to preserve (repeatable) [null].\n" +" Equivalent to \"-x ^STR\"\n" " -B, --remove-B Collapse the backward CIGAR operation\n" "\n" "General options:\n" diff --git a/samtools/samtools.h b/samtools/samtools.h index 85102d2..e0f99c2 100644 --- a/samtools/samtools.h +++ b/samtools/samtools.h @@ -37,6 +37,16 @@ void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PR void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp); +/* Utility functions to register an output htsFile/samFile/vcfFile that + * might be stdout. If FNAME is "-" or NULL, records FP so that print_error() + * et al can automatically flush it before printing an error message. + */ +void autoflush_if_stdout(htsFile *fp, const char *fname); + +/* Call this before closing FP; check_sam_close() does this automatically. + */ +void release_autoflush(htsFile *fp); + /* * Utility function to add an index to a file we've opened for write. * NB: Call this after writing the header and before writing sequences. diff --git a/samtools/stats.c b/samtools/stats.c index f030cf5..1b4f051 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -55,7 +55,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include -#include #include "samtools.h" #include #include @@ -271,6 +270,8 @@ typedef struct { } pair_t; KHASH_MAP_INIT_STR(qn2pair, pair_t*) +KHASH_SET_INIT_STR(rg) + static void HTS_NORETURN error(const char *format, ...); int is_in_regions(bam1_t *bam_line, stats_t *stats); @@ -1144,7 +1145,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair { const uint8_t *rg = bam_aux_get(bam_line, "RG"); if ( !rg ) return; // certain read groups were requested but this record has none - if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return; + khint_t k = kh_get(rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end((kh_rg_t *)stats->rg_hash) ) return; } if ( stats->info->flag_require && (bam_line->core.flag & stats->info->flag_require)!=stats->info->flag_require ) { @@ -1802,7 +1804,8 @@ void output_stats(FILE *to, stats_t *stats, int sparse) if ( stats->gcd[igcd].depth ) stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); } - qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + if ( stats->ngcd ) + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); igcd = 0; while ( igcd < stats->igcd ) { @@ -2020,34 +2023,32 @@ int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) return 0; } -void init_group_id(stats_t *stats, const char *id) +static void init_group_id(stats_t *stats, stats_info_t *info, const char *id) { -#if 0 - if ( !stats->sam_header->dict ) - stats->sam_header->dict = sam_header_parse2(stats->sam_header->text); - void *iter = stats->sam_header->dict; - const char *key, *val; - int n = 0; - stats->rg_hash = khash_str2int_init(); - while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) - { - if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) - { - khiter_t k = kh_get(kh_rg, stats->rg_hash, key); - if ( k != kh_end(stats->rg_hash) ) - fprintf(stderr, "[init_group_id] The group ID not 
unique: \"%s\"\n", key); - int ret; - k = kh_put(kh_rg, stats->rg_hash, key, &ret); - kh_value(stats->rg_hash, k) = val; - n++; + stats->rg_hash = kh_init(rg); + if (!stats->rg_hash) error("Could not initialise RG set\n"); + sam_hdr_t *hdr = info->sam_header; + const char *key; + kstring_t sm = KS_INITIALIZE; + int i, ret, nrg = sam_hdr_count_lines(hdr, "RG"); + if (nrg < 0) error("Could not parse header\n"); + + for (i=0; irg_hash, key, &ret); + if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); } + } else { /* Check for SM name, as per manual */ + if (!sam_hdr_find_tag_pos(hdr, "RG", i, "SM", &sm)) { + if (!strcmp(ks_c_str(&sm), id)) { + kh_put(rg, stats->rg_hash, key, &ret); + if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); } + } + } } } - if ( !n ) - error("The sample or read group \"%s\" not present.\n", id); -#else - fprintf(stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n"); - abort(); -#endif + + ks_free(&sm); } @@ -2124,7 +2125,7 @@ void cleanup_stats(stats_t* stats) if (stats->quals_barcode) free(stats->quals_barcode); free(stats->tags_barcode); destroy_regions(stats); - if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash); + if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash); free(stats->split_name); free(stats); } @@ -2271,7 +2272,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr stats->cov_rbuf.size = stats->nbases*5; stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); if (!stats->cov_rbuf.buffer) goto nomem; - if ( group_id ) init_group_id(stats, group_id); + if ( group_id ) init_group_id(stats, info, group_id); // .. 
arrays stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); if (!stats->quals_1st) goto nomem; diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index 9e8165d..7f763f8 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -57,7 +57,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include "samtools.h" #include #include @@ -273,6 +272,8 @@ typedef struct { } pair_t; KHASH_MAP_INIT_STR(qn2pair, pair_t*) +KHASH_SET_INIT_STR(rg) + static void HTS_NORETURN error(const char *format, ...); int is_in_regions(bam1_t *bam_line, stats_t *stats); @@ -1146,7 +1147,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair { const uint8_t *rg = bam_aux_get(bam_line, "RG"); if ( !rg ) return; // certain read groups were requested but this record has none - if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return; + khint_t k = kh_get(rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end((kh_rg_t *)stats->rg_hash) ) return; } if ( stats->info->flag_require && (bam_line->core.flag & stats->info->flag_require)!=stats->info->flag_require ) { @@ -1804,7 +1806,8 @@ void output_stats(FILE *to, stats_t *stats, int sparse) if ( stats->gcd[igcd].depth ) stats->gcd[igcd].gc = rint(100. 
* stats->gcd[igcd].gc / stats->gcd[igcd].depth); } - qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + if ( stats->ngcd ) + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); igcd = 0; while ( igcd < stats->igcd ) { @@ -2022,34 +2025,32 @@ int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) return 0; } -void init_group_id(stats_t *stats, const char *id) +static void init_group_id(stats_t *stats, stats_info_t *info, const char *id) { -#if 0 - if ( !stats->sam_header->dict ) - stats->sam_header->dict = sam_header_parse2(stats->sam_header->text); - void *iter = stats->sam_header->dict; - const char *key, *val; - int n = 0; - stats->rg_hash = khash_str2int_init(); - while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) - { - if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) - { - khiter_t k = kh_get(kh_rg, stats->rg_hash, key); - if ( k != kh_end(stats->rg_hash) ) - fprintf(samtools_stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); - int ret; - k = kh_put(kh_rg, stats->rg_hash, key, &ret); - kh_value(stats->rg_hash, k) = val; - n++; + stats->rg_hash = kh_init(rg); + if (!stats->rg_hash) error("Could not initialise RG set\n"); + sam_hdr_t *hdr = info->sam_header; + const char *key; + kstring_t sm = KS_INITIALIZE; + int i, ret, nrg = sam_hdr_count_lines(hdr, "RG"); + if (nrg < 0) error("Could not parse header\n"); + + for (i=0; irg_hash, key, &ret); + if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); } + } else { /* Check for SM name, as per manual */ + if (!sam_hdr_find_tag_pos(hdr, "RG", i, "SM", &sm)) { + if (!strcmp(ks_c_str(&sm), id)) { + kh_put(rg, stats->rg_hash, key, &ret); + if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); } + } + } } } - if ( !n ) - error("The sample or read group \"%s\" not present.\n", id); -#else - fprintf(samtools_stderr, "Samtools-htslib: init_group_id() header parsing not yet 
implemented\n"); - abort(); -#endif + + ks_free(&sm); } @@ -2126,7 +2127,7 @@ void cleanup_stats(stats_t* stats) if (stats->quals_barcode) free(stats->quals_barcode); free(stats->tags_barcode); destroy_regions(stats); - if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash); + if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash); free(stats->split_name); free(stats); } @@ -2273,7 +2274,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr stats->cov_rbuf.size = stats->nbases*5; stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); if (!stats->cov_rbuf.buffer) goto nomem; - if ( group_id ) init_group_id(stats, group_id); + if ( group_id ) init_group_id(stats, info, group_id); // .. arrays stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); if (!stats->quals_1st) goto nomem; diff --git a/samtools/version.sh b/samtools/version.sh index 9d28100..0347be5 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.13 +VERSION=1.14 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/tests/tabix_data/Makefile b/tests/tabix_data/Makefile index 22e5f55..19812df 100644 --- a/tests/tabix_data/Makefile +++ b/tests/tabix_data/Makefile @@ -1,7 +1,40 @@ all: all.stamp -all.stamp: +DERIVED_FILES = \ + empty.bed.gz.tbi \ + example.bed.gz example.bed.gz.tbi \ + example.gff2.gz.tbi \ + example.gff3.gz.tbi \ + example.sam.gz.tbi \ + example.vcf.gz.tbi \ + example_badcomments.bed.gz.tbi \ + example_badcomments.gtf.gz.tbi \ + example_badcomments.sam.gz.tbi \ + example_badcomments.vcf.gz.tbi \ + example_comments.bed.gz.tbi \ + example_comments.gtf.gz.tbi \ + example_comments.sam.gz.tbi \ + example_comments.vcf.gz.tbi \ + example_large.bed.gz.tbi \ + fivecolumns.bed.gz fivecolumns.bed.gz.tbi + +all.stamp: $(DERIVED_FILES) touch $@ +%.gz: % + bgzip -c $< > $@ + +%.gff2.gz.tbi: %.gff2.gz + tabix -p gff $< + +%.gff3.gz.tbi: %.gff3.gz + tabix -p gff $< + +%.gtf.gz.tbi: %.gtf.gz + tabix -p gff $< + +%.gz.tbi: %.gz + tabix -p $(subst .,,$(suffix $*)) $< + clean: - -rm -f all.stamp + -rm -f all.stamp $(DERIVED_FILES) diff --git a/tests/tabix_data/empty.bed.gz.tbi b/tests/tabix_data/empty.bed.gz.tbi deleted file mode 100644 index 891fe9f541b79fd9b136813f44519e605529e4b7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 75 zcmb2|=3rp}f&Xj_PR>jWdJNo!pHfm7nBCX}nmIdWHb%^36lP$Uz2u{i4p5amnsR9d JX0UD$0RTSz4F>=K diff --git a/tests/tabix_data/example.bed.gz b/tests/tabix_data/example.bed.gz deleted file mode 100644 index b67da76bc82bb554a4ded832c9fb696b351205a0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 819 zcmV-31I+v%iwFb&00000{{{d;LjnL618tT&0yQTHh3m;`b|NQ+AJYZxGju?Oa*^<#@NjG{Ra5PCm^hz9z%@^79LD5q>FX`kD2CSZGI6s7B zP4YnKa!-Q@w2hl~I1BC)IhI7!hee@F4Qw+1X!rGM{k?2E< 
zVjtRx9*$A1uHK|*`ifQ8uXI?8nGM#YvW-U-eMtj*G+;}FCT)a11`J&y+99c0fub8j zZgFd6;8|#wI;6>YTE1pGnpI92U7Go0QJd;OS>)!Na_w=}tV_$I=DD6-29gVoT87i!R~{9!^#+Rfr`}v!4=wR42Mfr`EU^!8T~?^%L{^Z10JVErjA)g zoS?ZZ&|Jb!7agNpa3-LcE27q80k(4CJeBkVdV_6YIZ6Y94q1{(1*_-w&r;|B5e6Fw z|4KbT>{B|cG0W|;!0y8`mwE;USAKZ`z_5^mAaO;C#V=$dfI9Fz5EUK+GUasub83!I zyX$#c=)V9UhS%90!!u>W0K^NWRc#=Q+#s4C)pbyE_H$GFPh?%mFZ5f`qpuI`NhD%^ zk)Ein2+UxxiELZpyqGR4$qCF)?!txai{V0%xxP@QP<=msqb~^+x*X%SV!~_0$p5<> zvko^>q4}}8K?>2p)>q7h?udZqN)Wp7g+i?QqPz#;vRX(rHd35FL`sGi>V2sUIUVyM z!M}WKb;mbUL6@hNM$C^?*lh&lwS>7rW*5q9+~-GkK3{0gv<{|-%@H%_)QvTf;Qu36 z+Vjd(LK6+#BRU&hRz4FENR~BXt__a)5rIOaGlurO^7-YN0iJvxy&K!WWsxx?CvaOX zG^4b;EP(MogI)l)1)P0oCiFIsSv_4algiPlaHjk=v00@PcwV$^lzf0nX2Fc&^=|r5 x;Cf0V%0}fv{{lVu=?3Qt001A02m}BC000301^_}s0stET0{{R300000000jW3mLczKc%E3F)+F*q$DtGoY@#LlaXmhgN5bd69O(0h8q|v z=lSp?Of^tA&bW}dRpF7rza$M=7SZ;P`~^-K>pcZ-Corhl&14iUP~2$m@UmP-;MPo0 zr}s^lmS23ad(DcxmYt1~C*=*QuRH2;HSQ{xy^~a!mh%13mBU9^FWx!byoAlo-f44l bkQ5t(sQ>;e`xqG*& diff --git a/tests/tabix_data/example.gff2.gz.tbi b/tests/tabix_data/example.gff2.gz.tbi deleted file mode 100644 index 30d39ae813cb8f70d131055a1bac589cfe223f4f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 107 zcmb2|=3rp}f&Xj_PR>jWehl1&pHfm%8W;r5G96jtu!cu8nduRSL8Uv#A|^(?M$h%0 r0--M+gr|;yA=*)Vwiqh|gFKpX(hSTX!@&Rm+I1KX diff --git a/tests/tabix_data/example.gff3.gz.tbi b/tests/tabix_data/example.gff3.gz.tbi deleted file mode 100644 index d23afbb6fc39c07e57879c620a1b4b113dd33f9c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1454 zcmZWpdsNZ~6sD)kG$T{nNn=p!WMN=_AG|3m1EAx>OQxOH>)UtkQNcPZ~ zB&ieg1wJN-3AVH>OVNBVMdjc#BPAe05oB0r|7_p6=l*lQ^WE>kac`QL*<@tvlu2_w(cOli%geq_o(peK|4je$dw(9-~KC);zYo8#qCUo&pw( zudF?meVMWcO2L&Ot-wAm(EJH+7{2crvtzB-dT3f8TcsnOaW^@qq|MVB)tBWXYC^Gx zK&Bt=DOSSA>}{5o(&ankeG=9c32U&6LTuAN$!lpX&WlaIn8Je<6ETUd)ouz-IE-Jn zTYKsJ)GHSx@kEd9O>Hj?a5!SwGroO^h0b0a^N@6w=OM&9fF4rbc%lCz 
zjM%U@;sbF?#`Y#wdcgT7r-?|B)m3^WpLqdw{&e#}#q~6wQMTQIFpG4da^QI#3$ADi zA4H3un!7QnoHX1bSCLyi5TNUF$1OiNy=}?sIE-{QZZT_wE0Y6if$52Z6z$w?fXWQ! zyDD-;sH~_en+W~ws_#m0mvqMmnB{KF<-tw|<_#h_p(ClSf22)K@XfamT7INQ+ih+O zLA2-tQm<~`%MJw?=j`S~UG|Z=`zG$>v}dST%Y#Y=^Lxdn?li%2;|R5rPF4OkGCVI`iR*QZ zQ2TYH+kI@Ek=4EdJgDS>mBv<9;#|Wr?H2bE%2}l zt^B*ZQcXM`GQ1rtovFj7PVoZLM@?%eL`<^b2su>+Qe|I5U8GJ*(w#ZGD4lqLI*^V3 z2^Wu8K(LGo{^W6z5m6yakx5i(K&=zW8r55_Yqn-tUbBgZa9(`?v}9x&$*py0nnFUZ zP+IG$;zJO)*}|)+UGK(3i>%sTQYP?>5BXQ3h893>d{Qm zY+)2NV=yVnfhEw#!tvJlu|?E$8dtaM#h=l=z250pl_HwDzaTASl3j{wUbeNHLZe4i z;$2*ZKHszBy->F1T=vRNlgTgn~GNs?k|yTpi4DqHQ>~zqySEWU;GU_ zY!*aUfK|*j;Oq?UR>9~B6>SZgbvB3?(6I?0;~rDS>o$)%FkCQ3YuL diff --git a/tests/tabix_data/example.sam.gz.tbi b/tests/tabix_data/example.sam.gz.tbi deleted file mode 100644 index a6c84f17ec90c8a57e3c41e1b0f0d546446a4137..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 128 zcmb2|=3rp}f&Xj_PR>jW$qd|upHfni5)u+vlF}5?5*rxUHO{ggIVtJdrf{f(aml4% zjzz!pLyqdWalAF{WYp=`<6HWGAyR)+<>m)m-4YYeI*X?tOS-3H;?1D!b!*i%pwaSZ Mc1SZYgRKM+00O)uuK)l5 diff --git a/tests/tabix_data/example.vcf.gz.tbi b/tests/tabix_data/example.vcf.gz.tbi deleted file mode 100644 index 97c80efec315040f37a7d0c230ce868d17739b1b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 180 zcmb2|=3rp}f&Xj_PR>jW(;2u6Kc%FkB_t#;CAn$vGMx&T#oSRV$#Yh+hvz_1aErpL z1O+LtYG0=pu43+C+)Q>nixWgHh&5bVsBSX%(4~i-8ob`iUp#v8QY2EJ+e7|}&5Wv4 zw~)_%he}^Hq|Uf_=5^z%_Ls9cmnQzqR8wA6C$D0B@vY2K_e_Bqo{GH84D*+=++ks3 QV30?1mox)2*qI;#03Sg+xc~qF diff --git a/tests/tabix_data/example_badcomments.bed.gz.tbi b/tests/tabix_data/example_badcomments.bed.gz.tbi deleted file mode 100644 index 0ab947f612fe56e7947ef2af0285085eebdba6dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 194 zcmb2|=3rp}f&Xj_PR>jWOBuKeKc%E3F)+F*q$DtGoY@#LlaXmhgN5bd69O(0h8q|v 
z=lSp?Og$7JI!{KxDu6?Y@A3TROO64B68}xrTfWU=*~pv5=Tl?Eto6Vt=jWD;c;8Kc%E3H88j-q%kIiDTK9k6*3kUxlL5;Fkm`eW#kyb z0Ax(XYcno4oPqwNv_jN+4|#4$G1`yshwM-P;vge))x%AG&j(=L!Fey}3~WRxGdLB<43h)RNy+o%ky8(T9QtCb<;j ig(d4)R>@AeWWc~M*LH8!K1K!xc{GjW$qd|upHfni5)u+vlF}5?5*rxUHO{ggIVtJdrf{f(aml4% zjzz!pLyqdWalAF{WYp=`<6HWGAyR)+<>m)m-4YYeI*X?tOS-3H;?1D!b!*i%pwaSZ Mc1SZYgRKM+00O)uuK)l5 diff --git a/tests/tabix_data/example_badcomments.vcf.gz.tbi b/tests/tabix_data/example_badcomments.vcf.gz.tbi deleted file mode 100644 index 38f4b591f25cea7ada5d617a324070f784b2cfd2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 184 zcmb2|=3rp}f&Xj_PR>jWvl+MxKc%FkB_t#;CAn$vGMx&T#oSRV$#Yh+hvz_1aErpL zd4|qUWFr-}TvzFcD=Bni3whAIByEb^(oDzZOAFOa<{rBAuxd+5mtNmIzlmR)H1tgfZ@YkUBYk<_W`3W^M>f}|7FTRyo>YgdEV)=wNMh3rs VuV%BbF)+xZxlNjZ8SGRL0RYj@LA3w? diff --git a/tests/tabix_data/example_comments.bed.gz.tbi b/tests/tabix_data/example_comments.bed.gz.tbi deleted file mode 100644 index 89b1bb3384da9062c4055b64eb115855b3170981..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 194 zcmb2|=3rp}f&Xj_PR>jWOBuKeKc%E3F)+F*q$DtGoY@#LlaXmhgN5bd69O(0h8q|v z=lSp?Og$7J+9o4l6~Ljy_c*`#l4C%j#D638mT$9IHu9$N`P3LOYdvrZI(&v>p=6|D ziNq8;?xpV+UX+gryvtF{Y&^+6;hWr(rVEMz-w*xZ;aPU(#Jkp&?Va38BI<5NG# diff --git a/tests/tabix_data/example_comments.gtf.gz.tbi b/tests/tabix_data/example_comments.gtf.gz.tbi deleted file mode 100644 index 54f5389f960f8bb24e6375befeb007509d2bc953..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 198 zcmb2|=3rp}f&Xj_PR>jWD;c;8Kc%E3H88j-q%kIiDTK9k6*3kUxlL5;Fkm`eW#kyb z`tufr5{@f-4eXc?i`k`oKXm2rk;J@aM!6K@ ig(WLkR>@AeWWd1C^S3{FA0q>UJeo_T8JNKi2N3{RGC}qL diff --git a/tests/tabix_data/example_comments.sam.gz.tbi b/tests/tabix_data/example_comments.sam.gz.tbi deleted file mode 100644 index a6c84f17ec90c8a57e3c41e1b0f0d546446a4137..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 
128 zcmb2|=3rp}f&Xj_PR>jW$qd|upHfni5)u+vlF}5?5*rxUHO{ggIVtJdrf{f(aml4% zjzz!pLyqdWalAF{WYp=`<6HWGAyR)+<>m)m-4YYeI*X?tOS-3H;?1D!b!*i%pwaSZ Mc1SZYgRKM+00O)uuK)l5 diff --git a/tests/tabix_data/example_comments.vcf.gz.tbi b/tests/tabix_data/example_comments.vcf.gz.tbi deleted file mode 100644 index 38f4b591f25cea7ada5d617a324070f784b2cfd2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 184 zcmb2|=3rp}f&Xj_PR>jWvl+MxKc%FkB_t#;CAn$vGMx&T#oSRV$#Yh+hvz_1aErpL zd4|qUWFr-}TvzFcD=Bni3whAIByEb^(oDzZOAFOa<{rBAuxd+5mtNmIzlmR)H1tgfZ@YkUBYk<_W`3W^M>f}|7FTRyo>YgdEV)=wNMh3rs VuV%BbF)+xZxlNjZ8SGRL0RYj@LA3w? diff --git a/tests/tabix_data/example_large.bed.gz.tbi b/tests/tabix_data/example_large.bed.gz.tbi deleted file mode 100644 index 35c9cd2b01f5ce162a056ce3032f93512e2df718..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7877 zcmZ{pd0bKn-^H70(zmjl#>!C9%smxHG))w(+y%9=Tv02bBDWOW$aJd7$`nNu!xS?> zrgF(8_hrfo6%)BOn&ncV!W}CmF?W4$&*%B$o#*pD{J|gm$vx+L&iP%f+yC>^hVP$s zg$+j>f86kgX7I3LtzcffE;Dwy1z$d{bHL}(AoUpHWYjLt^3wE|O_|4zCPjttgF64u5FRkEg57sqFNC5vq?Q7$*04+K;{ zZ*9Tfy7b_~SolC{W#tPnM?4lT#QPn2)lkS>e*S{6J|CDo$!#%fkGgW@dQ4{MC^Eec z{Yb+sD<^#*yjlO;vY$ce1nwd@5p(s5I4pl#+DdY9Eg7IUw)|<~KgU?OY$D|{}>+^9|QGG^taYc?VV?B(Qy(_j@UJ~fVkRW3tua^nRkj2-9W&LvLO|i{OVIN)44<%rzKT;7MQg9 zkFL8Yv;wWizxkU^RpccNkP0(_a#$M(*@ zw0oy>BVv7b+nedMccJqq-aM-Aj|-S%SPnC;*v77f#=gHi^pV!HoBn|pE8c1&$_|<4 z2Gq5=8M&t~(=iAn~3FC?Hw2h}3!qgmtdH1oypi+0KD?udw8t+zN ztLG!^Mo0*Ax4-rh8mAv(n-Pj5Ii}G9jf9Eo?f9k3HjA`~U36}7*RmPDZ^mvmS~nqn zNQYN;H)yiUAZus#=#kjOV?%Ynn}m9V7@Mw6jQ4!4_SOp!?JM@pD^ZJ$yNa#Qx%)c* zx_Y-;SR?O&#yed61*1oGj5I4`pKouf_x4YFgUF1bPsrhuQJ)Y7DE#1P^@-{X)I`Y- zv0ZzIB5#(Vv1+cJ0-Xs!6#M4j(Ale&-Z@#k&PbmLIjlP0HBi#|#xBXuvm(&M)if`Y zf$86$M0zs(#J4xZ%r+v2@d_FB$gVZGcSL)6nsMugfaG?&OGhh|yEnUXR~s@Mw5N-5 z6nnd!CETg|*oJ+i5V~rm;8p#2-#a8&VWOU=JS{NI8h?FqchZ+yU#PEvcb5{lDY|A}TY^^KmS(5-Wg zlWMih6hC5yf#-aP&#_uQJod&vk_1OfAAD)XnfxF(bzG&IK-_vINRbi}O 
zl#Olir=v#$w+I@}d?jzTsn7is>Y`sjkZt)Iq1QON&)@|8TLH3-(c3 zX{Li+8tUWMJ&~7d3R_OreH8$<6RRW0J=tGYtNGdi^N)K-SWO#5tHJn8fAwdZfVHbF z@yC`bJDG#{V)g1XPWG*)jtLWg|A9@}d)JO}y|}Z}qS3%B>k9vD-gQ!^n^&WO^}E?F zuT6x%o&iTQ_sb%!AlrXxN+0d9`{qB}HZs>179Pw|Zz&>0lcR3K72_51VHFH2!)Oh;bWmFqwtZ`BcF!1AI4eXsyuX;^B~X z!vq|S6^>j?E7PC}qXqN?uG#(~Wqd!sTB?$?%*3f$S0qgeBdWU*HuyOlGDx%0tQc!- zsJM$^db>+Q_VtJnGsMNqO5nQ-4|acTb5?ldp5QseW|Rf_O%nH7sCh)I*t9Ss-W_42 z28D#1u11dc#8+o|`hGGjj%R2$<5p=JBaT*}(bjny4d2Ks*04PquV(USdtRE5aO(F$ z2aoLxb-JCmU62|8k{m$b|lws}6KC8;o$-!YG(T^*3tkCD0`AaLF z>o*pZi;6|&U-EV^Q=@9SIU08`sIQ&5D?r$@eSjZM!K3>k(Zql~!L6#cY)6#;CEG@P zWLh+bf4ECsP+N*@ZB(jzK7eFrW;~&n&U3jcDSK6#JuW4Hv8SRzFUDvJ;PA7epzkPb z5vS(Ty&yVBWvNIH-l9eox(J!%LT7$s!*8tH(uHmC)`lEbxU`W2?ot!LJuejH$ygzeoP5~z+E6|* zo|)@|iWTH0`yZuj!GxN_Zu0#I4P_Z^`k8s0p{sv!o|M-9mDAhw=1{3s4l%l0?v#vC z2=w3*Sh2tS0LS!)DYtW%nQxDl9(Z~d*?L{6&D0`iCKwiO<_YI; zYp}IqS76k!iQr-!#pIO&RtS1rOoU_3jvuO-0b5nN(fuL z4dpW9O&Mj2$H>Vy*ZXE+XmRg&%HoNTBfiYsjU;zhO=eCg1?%s$lrA{r4nt&4;A>kf zeHoL3+$X3u?Z_wG8W#4+nK0(SiBjwQr;Im&XKQIy@qoOomi$Z^f5wkEjR@ZU$#kR( zc;~&O`0*4>ZNE-2ro#>9pvD)GeuNu{M4A0k8#=W{dQerhkF*2cUcy%O14!G@gQA#? 
z@P3&hC5kQ1A`f485RtUt%7FPzS1_rNWdJ6oQBSXAvhu;iJnEoqH#)jshJdpdeq&{U zi@8)Y>G?j==;k59aWAmb)aUEz!=RDLEN8vz4~dXr~Vz zQESVVg@FfIRF#-5XdV!w07oCZ!I}j-!+(B;xd=o1^^p$12Z0!L)P2cWR1}x3l>m^G z;q!oO!x~noBVV~rHIOE%2}QF%!p%hH8&i1_Pt@9BzU_i4db(e>176%GvpL)N>)x7k zR!n(1d5?J|)@U#KI;YZ*k$vSj66fOb%Kxz1BRA4XV!3wYW`9@Il;Cw5vV5}@9W=$y zL^5f=Pj~xYBL2PCw1{vY?vV3M6Zp%gY5BbY@Gf-8Ax~R1MPH|bdpb8m%aMO#C5J@^ z7k0sU;z4<#CqIdd?RX)_dGII5D-5dr3YDb}E}-G{@8c<7?unzm0}HJzlKl$L5}E(b zlO{2!5-@Q`W#6nCJi5>PYAP_R312T2JFeVd38k?4Q|@Q^Wk@5%$sDR3QmwFTi%;Z%iCOj_L@$Ci7IW+Uai@`Q_hei{v#Vo7C7enNTOXRGr>Bz^cfQ49icPqGh{ z;^=Bt!I7v@D%fHLRH;~5KJ3ER0voob-j~>;G&x~EX92SZ(cXYe3Er`wjyCI;?T1&2 zO(~qvoug7-Y*hDyf8Z^}R`6xRH8W~@%>^kHO ztGIfSP@85skTVIS@42_+^# zv{KU@apWXGv|2lcv`nKyn7ZjBwaKGeNTpA0tx(jfwb`|3$nQ@fBK>h+PXi+ z7{1y-XI+vu_eq=e9&0ry`lvDP7Ugb{tfbk#B0k*g$*Y3Xa zC~C)Mc_EfxDLn{pFO@`~df8!Wh*HUM6oMz-kE(bYCm%ko$k#0+ZtM|TRVBN?LKC^O1@!fj;L23A$o9?0vRL_X20V-O?p6kAwr zR|fTs^Go~O(l5ZrKSz<~0dJSx2M=kOj5u=VNuW2a#Oz{OMg~u;i)xBdL!Sd;cEcqf zf21Jpi>=A>j~Ua^)?3v zBLrDhS!)~f8+lvK7j3{+$<=M$5T*v19Od=gNM7i{hmqy)H7LD2@evf_DM5}qEQ|)F zY;i&Zjdd4n$fKqVyL*p@zaPf%H!fJ9v-)OLQEAe|Eka-xijV(~Rbk}hv>5(b!ndCN z+bTN~p%C1>34dH0G)&jSdl-R6I+xFu?cJD1Scrp_BQhaY6-;5m9f{#rUvX-#V=o7l z>V0TNwzev@nOUH^-YC8~SwL~acr)K9I?%qY0do(?%`+At9I>iHIByB(fB1>wRVwx) z7g9s}Nmg))$b6e>KgkH5-ygFJKKLF*iMl7ACl6nJ2R2@#HiLWqT$S57@jJ+NF8olL zE?U@Uj^*D@yv`bwCSIdL)rvJQU9g60eRPl~dk9s@mQLRY@jX`>@yVLZ>`;B?5J*!N>JjEaO`WaNuV}%=}Ss6}RMrmu}TC))9mMwO2g?8yVCo@W1z}!~A}--3?V~ z5|io&Cc>jdvsUnFz&u@9x_F1>EnO(22DqR*E$>>6=gJxhN9;`u>02Jy%*j~pe=Szug9DXvLmm7_vC=r_FspHf)zL%B_}m`5 zLQhOf>a`F$Q&TrpnhUp0-g+F8`Vl?wIeOr;_SBs6)ZEU{$5CmXyF%M0`f?ouV^3=` z?}Wbf1Wx{R84M4^wgj>|n3zlD%YH(8_d~)On?a&00T~J& z{UMSv$_>kq%acGxFA*dEWpt1&rl6vpy^;@m zxH`ZSK5A0FaK%5M)-qhBi(Ao-WIJa*De(#`UwY>-e@v#2UcV=Xqt>p^gNrv=<|uo1 zSmEeKO~HQ8O1Im`^e12Z2!yybrF$f7HqmP(aK=m=ci}n6HpI3A_~fhDKd#LYjyrh_#qm*Ef!Vmpp>BM(OLDzvVwD{;HXjk6l-~A$ae` zG%uivr3-pqr4q>LUi``!D>UTvOzQ0zBXoX0337U$%y!sK2nEP>>bn?4G!!6*_^XLG 
zSvSBWC_n(2F015w5OT!3#w|8QSWEYmh>H`mSgHLoZTNbL*eCh0cL(z`6Hao&AbNCE zW--m~P3fm%K!i~%m)owv$S?7TsQ)!2Q&!-QarxWh+R2yJFH4anjo1rD@baomEYSdM z1x^`m44!d>ebHxYVZx%ZjL8%Lb@VMHD{uu@TH0A_hF|j6FuSHgc)1gHH{C*tzTHa# zTQ;bQmeo<0V4+-I$D>HXx^XNlZxlm_;=ojFGPl`)9jP{a~VNk>2gUnMm0s z@DPL}tM1qO(JCJLJ2t6s>Z$TUej4(%lS_enN6;IOm+d+#Z@W439QMX|hEY7Y=(a>T z~m`H{Jt1i#XU*A zTrQH?qN1KpgN@MR02?+Y^jjR_ZzQk4zUK!8%8zvFIhh^W43KSsSAN(=agi<)`e$JD{fe#TQ z(QeU^Wd^IhunQ(So>h&OEv8~R{#i5f0lLi$UWD&f1p0S1V~QOWD1wW~Z*DvAdn4@K<;$TTE#25KfU>ocZq(^ZzNU4^~Wgji9#hPEe_Y z{`4x6{p)UeRU&b}sVRgjY?`9R*90`m3$jOEgti06&OL1F65J}>|$ ztZjrpZe>`~)9N4RWu&_5*XFrTySplWGV%EY%+lm@f3lE$u4xW5(b~cT%k8wu(*Wwo zTbB=eKqx};$%PrAA`%MfiQqIALHd)<|P zue)&J$0W*km!m>D)a4m{GVg!jYPeyJc{cBX4L%UTWGT)H?zx}cNvrzkf4rdE?RJ2^ zZ}zwDg(mXJ{4ds=daD6?$|I!%3|i?MoZ2Wu=8pUj%zUjmmQVFUPHY6h^bxL5dt)o% zAZ)oydWcANpFMXF7wGEo4l28B@lSLwTQI_Qa3Pzz-X}9^yT?0Ryz5Ru z@Y;O9RbnH7VsHQM@x30yj2C#5kOB`3#>T`<#n`>sx#=Nv*ecU&jpMdoQ9AI6{%=`s5>n4cPA%x64f%Y*1D1oBh70#L%AlU_@bY#ZZhqH;*IUv>?}O;fkD`*Y*YZ9vUKrChS#?%;&ql02J=6 zi-+XAhO2NhxzNeg+KVm0qI&O(akxXme@}GW_5c48_wO~Ol**O3qo6glCzbt=A>zu1 z1orcKiNpuhc8`6!-njo0(v*i4nqFJ{E-sU>7n}#W3S=CbOmxF+ACSMAGAj}eU zGJ##t)rt0SBt<~X68vt3d{9Jw^B7{*KD3rcD0GTbpvacnq|bkd(9-R2#FOt>mcCa7 z?0f_Hn0HQv`-*%(yKB9gg^tsUT67oK)L zJ70z@a8%2@<5#fNEnJ&uoUX^I$;F!L^!FfJfAnQM9VEEK!7gnK$M{?(5)D=Ep0XfD zYX?VT5_05iw?A{&FT?)U)zLcTN0e%dFLnQJzix2}i22e;dFz#yrNt_M?k2q3KG= zSHcL1c5nmWKTdIw(b@S_Rs1B{6XoD}O+gl?X+@|-oW_^rlD!#W& yq!f2-CAT}xD^b>|*5I1r$aMEQVqLkoFVOedsSO*n{^Lcp`i71FdT;&x>wf|8F!@RV diff --git a/tests/tabix_data/fivecolumns.bed b/tests/tabix_data/fivecolumns.bed new file mode 100644 index 0000000..2ba7294 --- /dev/null +++ b/tests/tabix_data/fivecolumns.bed @@ -0,0 +1,4 @@ +chr1 100 200 one apple +chr1 300 400 two banana +chr2 100 600 three carrot +chr2 700 800 four durian diff --git a/tests/tabix_data/vcf/16.vcf b/tests/tabix_data/vcf/16.vcf index eaa067e..d36970e 100644 --- a/tests/tabix_data/vcf/16.vcf +++ b/tests/tabix_data/vcf/16.vcf @@ -129,7 +129,7 @@ 
##source_20110031.1=/nfs/users/nfs_p/pd3/cvs/vcftools/perl/vcf-annotate -d /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.desc -a /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.tab.gz -c CHROM,FROM,INFO/VLD,INFO/KGPilot123,INFO/dbSNP ##vcfCTools=filter ##vcfCtools=merge freebayes.20:0-100000.baq.20110328.vcf, freebayes.20:100000-200000.baq.20110328.vcf, freebayes.20:200000-300000.baq.20110328.vcf, freebayes.20:300000-400000.baq.20110328.vcf, freebayes.20:400000-500000.baq.20110328.vcf, freebayes.20:500000-600000.baq.20110328.vcf, freebayes.20:600000-700000.baq.20110328.vcf, freebayes.20:700000-800000.baq.20110328.vcf, freebayes.20:800000-900000.baq.20110328.vcf, freebayes.20:900000-1000000.baq.20110328.vcf, freebayes.20:1000000-1100000.baq.20110328.vcf, freebayes.20:1100000-1200000.baq.20110328.vcf, freebayes.20:1200000-1300000.baq.20110328.vcf, freebayes.20:1300000-1400000.baq.20110328.vcf, freebayes.20:1400000-1500000.baq.20110328.vcf, freebayes.20:1500000-1600000.baq.20110328.vcf, freebayes.20:1600000-1700000.baq.20110328.vcf, freebayes.20:1700000-1800000.baq.20110328.vcf, freebayes.20:1800000-1900000.baq.20110328.vcf, freebayes.20:1900000-2000000.baq.20110328.vcf, freebayes.20:2000000-2100000.baq.20110328.vcf, freebayes.20:2100000-2200000.baq.20110328.vcf, freebayes.20:2200000-2300000.baq.20110328.vcf, freebayes.20:2300000-2400000.baq.20110328.vcf, freebayes.20:2400000-2500000.baq.20110328.vcf, freebayes.20:2500000-2600000.baq.20110328.vcf, freebayes.20:2600000-2700000.baq.20110328.vcf, freebayes.20:2700000-2800000.baq.20110328.vcf, freebayes.20:2800000-2900000.baq.20110328.vcf, freebayes.20:2900000-3000000.baq.20110328.vcf, freebayes.20:3000000-3100000.baq.20110328.vcf, freebayes.20:3100000-3200000.baq.20110328.vcf, freebayes.20:3200000-3300000.baq.20110328.vcf, freebayes.20:3300000-3400000.baq.20110328.vcf, freebayes.20:3400000-3500000.baq.20110328.vcf, freebayes.20:3500000-3600000.baq.20110328.vcf, 
freebayes.20:3600000-3700000.baq.20110328.vcf, freebayes.20:3700000-3800000.baq.20110328.vcf, freebayes.20:3800000-3900000.baq.20110328.vcf, freebayes.20:3900000-4000000.baq.20110328.vcf, freebayes.20:4000000-4100000.baq.20110328.vcf, freebayes.20:4100000-4200000.baq.20110328.vcf, freebayes.20:4200000-4300000.baq.20110328.vcf, freebayes.20:4300000-4400000.baq.20110328.vcf, freebayes.20:4400000-4500000.baq.20110328.vcf, freebayes.20:4500000-4600000.baq.20110328.vcf, freebayes.20:4600000-4700000.baq.20110328.vcf, freebayes.20:4700000-4800000.baq.20110328.vcf, freebayes.20:4800000-4900000.baq.20110328.vcf, freebayes.20:4900000-5000000.baq.20110328.vcf, freebayes.20:5000000-5100000.baq.20110328.vcf, freebayes.20:5100000-5200000.baq.20110328.vcf, freebayes.20:5200000-5300000.baq.20110328.vcf, freebayes.20:5300000-5400000.baq.20110328.vcf, freebayes.20:5400000-5500000.baq.20110328.vcf, freebayes.20:5500000-5600000.baq.20110328.vcf, freebayes.20:5600000-5700000.baq.20110328.vcf, freebayes.20:5700000-5800000.baq.20110328.vcf, freebayes.20:5800000-5900000.baq.20110328.vcf, freebayes.20:5900000-6000000.baq.20110328.vcf, freebayes.20:6000000-6100000.baq.20110328.vcf, freebayes.20:6100000-6200000.baq.20110328.vcf, freebayes.20:6200000-6300000.baq.20110328.vcf, freebayes.20:6300000-6400000.baq.20110328.vcf, freebayes.20:6400000-6500000.baq.20110328.vcf, freebayes.20:6500000-6600000.baq.20110328.vcf, freebayes.20:6600000-6700000.baq.20110328.vcf, freebayes.20:6700000-6800000.baq.20110328.vcf, freebayes.20:6800000-6900000.baq.20110328.vcf, freebayes.20:6900000-7000000.baq.20110328.vcf, freebayes.20:7000000-7100000.baq.20110328.vcf, freebayes.20:7100000-7200000.baq.20110328.vcf, freebayes.20:7200000-7300000.baq.20110328.vcf, freebayes.20:7300000-7400000.baq.20110328.vcf, freebayes.20:7400000-7500000.baq.20110328.vcf, freebayes.20:7500000-7600000.baq.20110328.vcf, freebayes.20:7600000-7700000.baq.20110328.vcf, freebayes.20:7700000-7800000.baq.20110328.vcf, 
freebayes.20:7800000-7900000.baq.20110328.vcf, freebayes.20:7900000-8000000.baq.20110328.vcf, freebayes.20:8000000-8100000.baq.20110328.vcf, freebayes.20:8100000-8200000.baq.20110328.vcf, freebayes.20:8200000-8300000.baq.20110328.vcf, freebayes.20:8300000-8400000.baq.20110328.vcf, freebayes.20:8400000-8500000.baq.20110328.vcf, freebayes.20:8500000-8600000.baq.20110328.vcf, freebayes.20:8600000-8700000.baq.20110328.vcf, freebayes.20:8700000-8800000.baq.20110328.vcf, freebayes.20:8800000-8900000.baq.20110328.vcf, freebayes.20:8900000-9000000.baq.20110328.vcf, freebayes.20:9000000-9100000.baq.20110328.vcf, freebayes.20:9100000-9200000.baq.20110328.vcf, freebayes.20:9200000-9300000.baq.20110328.vcf, freebayes.20:9300000-9400000.baq.20110328.vcf, freebayes.20:9400000-9500000.baq.20110328.vcf, freebayes.20:9500000-9600000.baq.20110328.vcf, freebayes.20:9600000-9700000.baq.20110328.vcf, freebayes.20:9700000-9800000.baq.20110328.vcf, freebayes.20:9800000-9900000.baq.20110328.vcf, freebayes.20:9900000-10000000.baq.20110328.vcf, freebayes.20:10000000-10100000.baq.20110328.vcf, freebayes.20:10100000-10200000.baq.20110328.vcf, freebayes.20:10200000-10300000.baq.20110328.vcf, freebayes.20:10300000-10400000.baq.20110328.vcf, freebayes.20:10400000-10500000.baq.20110328.vcf, freebayes.20:10500000-10600000.baq.20110328.vcf, freebayes.20:10600000-10700000.baq.20110328.vcf, freebayes.20:10700000-10800000.baq.20110328.vcf, freebayes.20:10800000-10900000.baq.20110328.vcf, freebayes.20:10900000-11000000.baq.20110328.vcf, freebayes.20:11000000-11100000.baq.20110328.vcf, freebayes.20:11100000-11200000.baq.20110328.vcf, freebayes.20:11200000-11300000.baq.20110328.vcf, freebayes.20:11300000-11400000.baq.20110328.vcf, freebayes.20:11400000-11500000.baq.20110328.vcf, freebayes.20:11500000-11600000.baq.20110328.vcf, freebayes.20:11600000-11700000.baq.20110328.vcf, freebayes.20:11700000-11800000.baq.20110328.vcf, freebayes.20:11800000-11900000.baq.20110328.vcf, 
freebayes.20:11900000-12000000.baq.20110328.vcf, freebayes.20:12000000-12100000.baq.20110328.vcf, freebayes.20:12100000-12200000.baq.20110328.vcf, freebayes.20:12200000-12300000.baq.20110328.vcf, freebayes.20:12300000-12400000.baq.20110328.vcf, freebayes.20:12400000-12500000.baq.20110328.vcf, freebayes.20:12500000-12600000.baq.20110328.vcf, freebayes.20:12600000-12700000.baq.20110328.vcf, freebayes.20:12700000-12800000.baq.20110328.vcf, freebayes.20:12800000-12900000.baq.20110328.vcf, freebayes.20:12900000-13000000.baq.20110328.vcf, freebayes.20:13000000-13100000.baq.20110328.vcf, freebayes.20:13100000-13200000.baq.20110328.vcf, freebayes.20:13200000-13300000.baq.20110328.vcf, freebayes.20:13300000-13400000.baq.20110328.vcf, freebayes.20:13400000-13500000.baq.20110328.vcf, freebayes.20:13500000-13600000.baq.20110328.vcf, freebayes.20:13600000-13700000.baq.20110328.vcf, freebayes.20:13700000-13800000.baq.20110328.vcf, freebayes.20:13800000-13900000.baq.20110328.vcf, freebayes.20:13900000-14000000.baq.20110328.vcf, freebayes.20:14000000-14100000.baq.20110328.vcf, freebayes.20:14100000-14200000.baq.20110328.vcf, freebayes.20:14200000-14300000.baq.20110328.vcf, freebayes.20:14300000-14400000.baq.20110328.vcf, freebayes.20:14400000-14500000.baq.20110328.vcf, freebayes.20:14500000-14600000.baq.20110328.vcf, freebayes.20:14600000-14700000.baq.20110328.vcf, freebayes.20:14700000-14800000.baq.20110328.vcf, freebayes.20:14800000-14900000.baq.20110328.vcf, freebayes.20:14900000-15000000.baq.20110328.vcf, freebayes.20:15000000-15100000.baq.20110328.vcf, freebayes.20:15100000-15200000.baq.20110328.vcf, freebayes.20:15200000-15300000.baq.20110328.vcf, freebayes.20:15300000-15400000.baq.20110328.vcf, freebayes.20:15400000-15500000.baq.20110328.vcf, freebayes.20:15500000-15600000.baq.20110328.vcf, freebayes.20:15600000-15700000.baq.20110328.vcf, freebayes.20:15700000-15800000.baq.20110328.vcf, freebayes.20:15800000-15900000.baq.20110328.vcf, 
freebayes.20:15900000-16000000.baq.20110328.vcf, freebayes.20:16000000-16100000.baq.20110328.vcf, freebayes.20:16100000-16200000.baq.20110328.vcf, freebayes.20:16200000-16300000.baq.20110328.vcf, freebayes.20:16300000-16400000.baq.20110328.vcf, freebayes.20:16400000-16500000.baq.20110328.vcf, freebayes.20:16500000-16600000.baq.20110328.vcf, freebayes.20:16600000-16700000.baq.20110328.vcf, freebayes.20:16700000-16800000.baq.20110328.vcf, freebayes.20:16800000-16900000.baq.20110328.vcf, freebayes.20:16900000-17000000.baq.20110328.vcf, freebayes.20:17000000-17100000.baq.20110328.vcf, freebayes.20:17100000-17200000.baq.20110328.vcf, freebayes.20:17200000-17300000.baq.20110328.vcf, freebayes.20:17300000-17400000.baq.20110328.vcf, freebayes.20:17400000-17500000.baq.20110328.vcf, freebayes.20:17500000-17600000.baq.20110328.vcf, freebayes.20:17600000-17700000.baq.20110328.vcf, freebayes.20:17700000-17800000.baq.20110328.vcf, freebayes.20:17800000-17900000.baq.20110328.vcf, freebayes.20:17900000-18000000.baq.20110328.vcf, freebayes.20:18000000-18100000.baq.20110328.vcf, freebayes.20:18100000-18200000.baq.20110328.vcf, freebayes.20:18200000-18300000.baq.20110328.vcf, freebayes.20:18300000-18400000.baq.20110328.vcf, freebayes.20:18400000-18500000.baq.20110328.vcf, freebayes.20:18500000-18600000.baq.20110328.vcf, freebayes.20:18600000-18700000.baq.20110328.vcf, freebayes.20:18700000-18800000.baq.20110328.vcf, freebayes.20:18800000-18900000.baq.20110328.vcf, freebayes.20:18900000-19000000.baq.20110328.vcf, freebayes.20:19000000-19100000.baq.20110328.vcf, freebayes.20:19100000-19200000.baq.20110328.vcf, freebayes.20:19200000-19300000.baq.20110328.vcf, freebayes.20:19300000-19400000.baq.20110328.vcf, freebayes.20:19400000-19500000.baq.20110328.vcf, freebayes.20:19500000-19600000.baq.20110328.vcf, freebayes.20:19600000-19700000.baq.20110328.vcf, freebayes.20:19700000-19800000.baq.20110328.vcf, freebayes.20:19800000-19900000.baq.20110328.vcf, 
freebayes.20:19900000-20000000.baq.20110328.vcf, freebayes.20:20000000-20100000.baq.20110328.vcf, freebayes.20:20100000-20200000.baq.20110328.vcf, freebayes.20:20200000-20300000.baq.20110328.vcf, freebayes.20:20300000-20400000.baq.20110328.vcf, freebayes.20:20400000-20500000.baq.20110328.vcf, freebayes.20:20500000-20600000.baq.20110328.vcf, freebayes.20:20600000-20700000.baq.20110328.vcf, freebayes.20:20700000-20800000.baq.20110328.vcf, freebayes.20:20800000-20900000.baq.20110328.vcf, freebayes.20:20900000-21000000.baq.20110328.vcf, freebayes.20:21000000-21100000.baq.20110328.vcf, freebayes.20:21100000-21200000.baq.20110328.vcf, freebayes.20:21200000-21300000.baq.20110328.vcf, freebayes.20:21300000-21400000.baq.20110328.vcf, freebayes.20:21400000-21500000.baq.20110328.vcf, freebayes.20:21500000-21600000.baq.20110328.vcf, freebayes.20:21600000-21700000.baq.20110328.vcf, freebayes.20:21700000-21800000.baq.20110328.vcf, freebayes.20:21800000-21900000.baq.20110328.vcf, freebayes.20:21900000-22000000.baq.20110328.vcf, freebayes.20:22000000-22100000.baq.20110328.vcf, freebayes.20:22100000-22200000.baq.20110328.vcf, freebayes.20:22200000-22300000.baq.20110328.vcf, freebayes.20:22300000-22400000.baq.20110328.vcf, freebayes.20:22400000-22500000.baq.20110328.vcf, freebayes.20:22500000-22600000.baq.20110328.vcf, freebayes.20:22600000-22700000.baq.20110328.vcf, freebayes.20:22700000-22800000.baq.20110328.vcf, freebayes.20:22800000-22900000.baq.20110328.vcf, freebayes.20:22900000-23000000.baq.20110328.vcf, freebayes.20:23000000-23100000.baq.20110328.vcf, freebayes.20:23100000-23200000.baq.20110328.vcf, freebayes.20:23200000-23300000.baq.20110328.vcf, freebayes.20:23300000-23400000.baq.20110328.vcf, freebayes.20:23400000-23500000.baq.20110328.vcf, freebayes.20:23500000-23600000.baq.20110328.vcf, freebayes.20:23600000-23700000.baq.20110328.vcf, freebayes.20:23700000-23800000.baq.20110328.vcf, freebayes.20:23800000-23900000.baq.20110328.vcf, 
freebayes.20:23900000-24000000.baq.20110328.vcf, freebayes.20:24000000-24100000.baq.20110328.vcf, freebayes.20:24100000-24200000.baq.20110328.vcf, freebayes.20:24200000-24300000.baq.20110328.vcf, freebayes.20:24300000-24400000.baq.20110328.vcf, freebayes.20:24400000-24500000.baq.20110328.vcf, freebayes.20:24500000-24600000.baq.20110328.vcf, freebayes.20:24600000-24700000.baq.20110328.vcf, freebayes.20:24700000-24800000.baq.20110328.vcf, freebayes.20:24800000-24900000.baq.20110328.vcf, freebayes.20:24900000-25000000.baq.20110328.vcf, freebayes.20:25000000-25100000.baq.20110328.vcf, freebayes.20:25100000-25200000.baq.20110328.vcf, freebayes.20:25200000-25300000.baq.20110328.vcf, freebayes.20:25300000-25400000.baq.20110328.vcf, freebayes.20:25400000-25500000.baq.20110328.vcf, freebayes.20:25500000-25600000.baq.20110328.vcf, freebayes.20:25600000-25700000.baq.20110328.vcf, freebayes.20:25700000-25800000.baq.20110328.vcf, freebayes.20:25800000-25900000.baq.20110328.vcf, freebayes.20:25900000-26000000.baq.20110328.vcf, freebayes.20:26000000-26100000.baq.20110328.vcf, freebayes.20:26100000-26200000.baq.20110328.vcf, freebayes.20:26200000-26300000.baq.20110328.vcf, freebayes.20:26300000-26400000.baq.20110328.vcf, freebayes.20:26400000-26500000.baq.20110328.vcf, freebayes.20:26500000-26600000.baq.20110328.vcf, freebayes.20:26600000-26700000.baq.20110328.vcf, freebayes.20:26700000-26800000.baq.20110328.vcf, freebayes.20:26800000-26900000.baq.20110328.vcf, freebayes.20:26900000-27000000.baq.20110328.vcf, freebayes.20:27000000-27100000.baq.20110328.vcf, freebayes.20:27100000-27200000.baq.20110328.vcf, freebayes.20:27200000-27300000.baq.20110328.vcf, freebayes.20:27300000-27400000.baq.20110328.vcf, freebayes.20:27400000-27500000.baq.20110328.vcf, freebayes.20:27500000-27600000.baq.20110328.vcf, freebayes.20:27600000-27700000.baq.20110328.vcf, freebayes.20:27700000-27800000.baq.20110328.vcf, freebayes.20:27800000-27900000.baq.20110328.vcf, 
freebayes.20:27900000-28000000.baq.20110328.vcf, freebayes.20:28000000-28100000.baq.20110328.vcf, freebayes.20:28100000-28200000.baq.20110328.vcf, freebayes.20:28200000-28300000.baq.20110328.vcf, freebayes.20:28300000-28400000.baq.20110328.vcf, freebayes.20:28400000-28500000.baq.20110328.vcf, freebayes.20:28500000-28600000.baq.20110328.vcf, freebayes.20:28600000-28700000.baq.20110328.vcf, freebayes.20:28700000-28800000.baq.20110328.vcf, freebayes.20:28800000-28900000.baq.20110328.vcf, freebayes.20:28900000-29000000.baq.20110328.vcf, freebayes.20:29000000-29100000.baq.20110328.vcf, freebayes.20:29100000-29200000.baq.20110328.vcf, freebayes.20:29200000-29300000.baq.20110328.vcf, freebayes.20:29300000-29400000.baq.20110328.vcf, freebayes.20:29400000-29500000.baq.20110328.vcf, freebayes.20:29500000-29600000.baq.20110328.vcf, freebayes.20:29600000-29700000.baq.20110328.vcf, freebayes.20:29700000-29800000.baq.20110328.vcf, freebayes.20:29800000-29900000.baq.20110328.vcf, freebayes.20:29900000-30000000.baq.20110328.vcf, freebayes.20:30000000-30100000.baq.20110328.vcf, freebayes.20:30100000-30200000.baq.20110328.vcf, freebayes.20:30200000-30300000.baq.20110328.vcf, freebayes.20:30300000-30400000.baq.20110328.vcf, freebayes.20:30400000-30500000.baq.20110328.vcf, freebayes.20:30500000-30600000.baq.20110328.vcf, freebayes.20:30600000-30700000.baq.20110328.vcf, freebayes.20:30700000-30800000.baq.20110328.vcf, freebayes.20:30800000-30900000.baq.20110328.vcf, freebayes.20:30900000-31000000.baq.20110328.vcf, freebayes.20:31000000-31100000.baq.20110328.vcf, freebayes.20:31100000-31200000.baq.20110328.vcf, freebayes.20:31200000-31300000.baq.20110328.vcf, freebayes.20:31300000-31400000.baq.20110328.vcf, freebayes.20:31400000-31500000.baq.20110328.vcf, freebayes.20:31500000-31600000.baq.20110328.vcf, freebayes.20:31600000-31700000.baq.20110328.vcf, freebayes.20:31700000-31800000.baq.20110328.vcf, freebayes.20:31800000-31900000.baq.20110328.vcf, 
freebayes.20:31900000-32000000.baq.20110328.vcf, freebayes.20:32000000-32100000.baq.20110328.vcf, freebayes.20:32100000-32200000.baq.20110328.vcf, freebayes.20:32200000-32300000.baq.20110328.vcf, freebayes.20:32300000-32400000.baq.20110328.vcf, freebayes.20:32400000-32500000.baq.20110328.vcf, freebayes.20:32500000-32600000.baq.20110328.vcf, freebayes.20:32600000-32700000.baq.20110328.vcf, freebayes.20:32700000-32800000.baq.20110328.vcf, freebayes.20:32800000-32900000.baq.20110328.vcf, freebayes.20:32900000-33000000.baq.20110328.vcf, freebayes.20:33000000-33100000.baq.20110328.vcf, freebayes.20:33100000-33200000.baq.20110328.vcf, freebayes.20:33200000-33300000.baq.20110328.vcf, freebayes.20:33300000-33400000.baq.20110328.vcf, freebayes.20:33400000-33500000.baq.20110328.vcf, freebayes.20:33500000-33600000.baq.20110328.vcf, freebayes.20:33600000-33700000.baq.20110328.vcf, freebayes.20:33700000-33800000.baq.20110328.vcf, freebayes.20:33800000-33900000.baq.20110328.vcf, freebayes.20:33900000-34000000.baq.20110328.vcf, freebayes.20:34000000-34100000.baq.20110328.vcf, freebayes.20:34100000-34200000.baq.20110328.vcf, freebayes.20:34200000-34300000.baq.20110328.vcf, freebayes.20:34300000-34400000.baq.20110328.vcf, freebayes.20:34400000-34500000.baq.20110328.vcf, freebayes.20:34500000-34600000.baq.20110328.vcf, freebayes.20:34600000-34700000.baq.20110328.vcf, freebayes.20:34700000-34800000.baq.20110328.vcf, freebayes.20:34800000-34900000.baq.20110328.vcf, freebayes.20:34900000-35000000.baq.20110328.vcf, freebayes.20:35000000-35100000.baq.20110328.vcf, freebayes.20:35100000-35200000.baq.20110328.vcf, freebayes.20:35200000-35300000.baq.20110328.vcf, freebayes.20:35300000-35400000.baq.20110328.vcf, freebayes.20:35400000-35500000.baq.20110328.vcf, freebayes.20:35500000-35600000.baq.20110328.vcf, freebayes.20:35600000-35700000.baq.20110328.vcf, freebayes.20:35700000-35800000.baq.20110328.vcf, freebayes.20:35800000-35900000.baq.20110328.vcf, 
freebayes.20:35900000-36000000.baq.20110328.vcf, freebayes.20:36000000-36100000.baq.20110328.vcf, freebayes.20:36100000-36200000.baq.20110328.vcf, freebayes.20:36200000-36300000.baq.20110328.vcf, freebayes.20:36300000-36400000.baq.20110328.vcf, freebayes.20:36400000-36500000.baq.20110328.vcf, freebayes.20:36500000-36600000.baq.20110328.vcf, freebayes.20:36600000-36700000.baq.20110328.vcf, freebayes.20:36700000-36800000.baq.20110328.vcf, freebayes.20:36800000-36900000.baq.20110328.vcf, freebayes.20:36900000-37000000.baq.20110328.vcf, freebayes.20:37000000-37100000.baq.20110328.vcf, freebayes.20:37100000-37200000.baq.20110328.vcf, freebayes.20:37200000-37300000.baq.20110328.vcf, freebayes.20:37300000-37400000.baq.20110328.vcf, freebayes.20:37400000-37500000.baq.20110328.vcf, freebayes.20:37500000-37600000.baq.20110328.vcf, freebayes.20:37600000-37700000.baq.20110328.vcf, freebayes.20:37700000-37800000.baq.20110328.vcf, freebayes.20:37800000-37900000.baq.20110328.vcf, freebayes.20:37900000-38000000.baq.20110328.vcf, freebayes.20:38000000-38100000.baq.20110328.vcf, freebayes.20:38100000-38200000.baq.20110328.vcf, freebayes.20:38200000-38300000.baq.20110328.vcf, freebayes.20:38300000-38400000.baq.20110328.vcf, freebayes.20:38400000-38500000.baq.20110328.vcf, freebayes.20:38500000-38600000.baq.20110328.vcf, freebayes.20:38600000-38700000.baq.20110328.vcf, freebayes.20:38700000-38800000.baq.20110328.vcf, freebayes.20:38800000-38900000.baq.20110328.vcf, freebayes.20:38900000-39000000.baq.20110328.vcf, freebayes.20:39000000-39100000.baq.20110328.vcf, freebayes.20:39100000-39200000.baq.20110328.vcf, freebayes.20:39200000-39300000.baq.20110328.vcf, freebayes.20:39300000-39400000.baq.20110328.vcf, freebayes.20:39400000-39500000.baq.20110328.vcf, freebayes.20:39500000-39600000.baq.20110328.vcf, freebayes.20:39600000-39700000.baq.20110328.vcf, freebayes.20:39700000-39800000.baq.20110328.vcf, freebayes.20:39800000-39900000.baq.20110328.vcf, 
freebayes.20:39900000-40000000.baq.20110328.vcf, freebayes.20:40000000-40100000.baq.20110328.vcf, freebayes.20:40100000-40200000.baq.20110328.vcf, freebayes.20:40200000-40300000.baq.20110328.vcf, freebayes.20:40300000-40400000.baq.20110328.vcf, freebayes.20:40400000-40500000.baq.20110328.vcf, freebayes.20:40500000-40600000.baq.20110328.vcf, freebayes.20:40600000-40700000.baq.20110328.vcf, freebayes.20:40700000-40800000.baq.20110328.vcf, freebayes.20:40800000-40900000.baq.20110328.vcf, freebayes.20:40900000-41000000.baq.20110328.vcf, freebayes.20:41000000-41100000.baq.20110328.vcf, freebayes.20:41100000-41200000.baq.20110328.vcf, freebayes.20:41200000-41300000.baq.20110328.vcf, freebayes.20:41300000-41400000.baq.20110328.vcf, freebayes.20:41400000-41500000.baq.20110328.vcf, freebayes.20:41500000-41600000.baq.20110328.vcf, freebayes.20:41600000-41700000.baq.20110328.vcf, freebayes.20:41700000-41800000.baq.20110328.vcf, freebayes.20:41800000-41900000.baq.20110328.vcf, freebayes.20:41900000-42000000.baq.20110328.vcf, freebayes.20:42000000-42100000.baq.20110328.vcf, freebayes.20:42100000-42200000.baq.20110328.vcf, freebayes.20:42200000-42300000.baq.20110328.vcf, freebayes.20:42300000-42400000.baq.20110328.vcf, freebayes.20:42400000-42500000.baq.20110328.vcf, freebayes.20:42500000-42600000.baq.20110328.vcf, freebayes.20:42600000-42700000.baq.20110328.vcf, freebayes.20:42700000-42800000.baq.20110328.vcf, freebayes.20:42800000-42900000.baq.20110328.vcf, freebayes.20:42900000-43000000.baq.20110328.vcf, freebayes.20:43000000-43100000.baq.20110328.vcf, freebayes.20:43100000-43200000.baq.20110328.vcf, freebayes.20:43200000-43300000.baq.20110328.vcf, freebayes.20:43300000-43400000.baq.20110328.vcf, freebayes.20:43400000-43500000.baq.20110328.vcf, freebayes.20:43500000-43600000.baq.20110328.vcf, freebayes.20:43600000-43700000.baq.20110328.vcf, freebayes.20:43700000-43800000.baq.20110328.vcf, freebayes.20:43800000-43900000.baq.20110328.vcf, 
freebayes.20:43900000-44000000.baq.20110328.vcf, freebayes.20:44000000-44100000.baq.20110328.vcf, freebayes.20:44100000-44200000.baq.20110328.vcf, freebayes.20:44200000-44300000.baq.20110328.vcf, freebayes.20:44300000-44400000.baq.20110328.vcf, freebayes.20:44400000-44500000.baq.20110328.vcf, freebayes.20:44500000-44600000.baq.20110328.vcf, freebayes.20:44600000-44700000.baq.20110328.vcf, freebayes.20:44700000-44800000.baq.20110328.vcf, freebayes.20:44800000-44900000.baq.20110328.vcf, freebayes.20:44900000-45000000.baq.20110328.vcf, freebayes.20:45000000-45100000.baq.20110328.vcf, freebayes.20:45100000-45200000.baq.20110328.vcf, freebayes.20:45200000-45300000.baq.20110328.vcf, freebayes.20:45300000-45400000.baq.20110328.vcf, freebayes.20:45400000-45500000.baq.20110328.vcf, freebayes.20:45500000-45600000.baq.20110328.vcf, freebayes.20:45600000-45700000.baq.20110328.vcf, freebayes.20:45700000-45800000.baq.20110328.vcf, freebayes.20:45800000-45900000.baq.20110328.vcf, freebayes.20:45900000-46000000.baq.20110328.vcf, freebayes.20:46000000-46100000.baq.20110328.vcf, freebayes.20:46100000-46200000.baq.20110328.vcf, freebayes.20:46200000-46300000.baq.20110328.vcf, freebayes.20:46300000-46400000.baq.20110328.vcf, freebayes.20:46400000-46500000.baq.20110328.vcf, freebayes.20:46500000-46600000.baq.20110328.vcf, freebayes.20:46600000-46700000.baq.20110328.vcf, freebayes.20:46700000-46800000.baq.20110328.vcf, freebayes.20:46800000-46900000.baq.20110328.vcf, freebayes.20:46900000-47000000.baq.20110328.vcf, freebayes.20:47000000-47100000.baq.20110328.vcf, freebayes.20:47100000-47200000.baq.20110328.vcf, freebayes.20:47200000-47300000.baq.20110328.vcf, freebayes.20:47300000-47400000.baq.20110328.vcf, freebayes.20:47400000-47500000.baq.20110328.vcf, freebayes.20:47500000-47600000.baq.20110328.vcf, freebayes.20:47600000-47700000.baq.20110328.vcf, freebayes.20:47700000-47800000.baq.20110328.vcf, freebayes.20:47800000-47900000.baq.20110328.vcf, 
freebayes.20:47900000-48000000.baq.20110328.vcf, freebayes.20:48000000-48100000.baq.20110328.vcf, freebayes.20:48100000-48200000.baq.20110328.vcf, freebayes.20:48200000-48300000.baq.20110328.vcf, freebayes.20:48300000-48400000.baq.20110328.vcf, freebayes.20:48400000-48500000.baq.20110328.vcf, freebayes.20:48500000-48600000.baq.20110328.vcf, freebayes.20:48600000-48700000.baq.20110328.vcf, freebayes.20:48700000-48800000.baq.20110328.vcf, freebayes.20:48800000-48900000.baq.20110328.vcf, freebayes.20:48900000-49000000.baq.20110328.vcf, freebayes.20:49000000-49100000.baq.20110328.vcf, freebayes.20:49100000-49200000.baq.20110328.vcf, freebayes.20:49200000-49300000.baq.20110328.vcf, freebayes.20:49300000-49400000.baq.20110328.vcf, freebayes.20:49400000-49500000.baq.20110328.vcf, freebayes.20:49500000-49600000.baq.20110328.vcf, freebayes.20:49600000-49700000.baq.20110328.vcf, freebayes.20:49700000-49800000.baq.20110328.vcf, freebayes.20:49800000-49900000.baq.20110328.vcf, freebayes.20:49900000-50000000.baq.20110328.vcf, freebayes.20:50000000-50100000.baq.20110328.vcf, freebayes.20:50100000-50200000.baq.20110328.vcf, freebayes.20:50200000-50300000.baq.20110328.vcf, freebayes.20:50300000-50400000.baq.20110328.vcf, freebayes.20:50400000-50500000.baq.20110328.vcf, freebayes.20:50500000-50600000.baq.20110328.vcf, freebayes.20:50600000-50700000.baq.20110328.vcf, freebayes.20:50700000-50800000.baq.20110328.vcf, freebayes.20:50800000-50900000.baq.20110328.vcf, freebayes.20:50900000-51000000.baq.20110328.vcf, freebayes.20:51000000-51100000.baq.20110328.vcf, freebayes.20:51100000-51200000.baq.20110328.vcf, freebayes.20:51200000-51300000.baq.20110328.vcf, freebayes.20:51300000-51400000.baq.20110328.vcf, freebayes.20:51400000-51500000.baq.20110328.vcf, freebayes.20:51500000-51600000.baq.20110328.vcf, freebayes.20:51600000-51700000.baq.20110328.vcf, freebayes.20:51700000-51800000.baq.20110328.vcf, freebayes.20:51800000-51900000.baq.20110328.vcf, 
freebayes.20:51900000-52000000.baq.20110328.vcf, freebayes.20:52000000-52100000.baq.20110328.vcf, freebayes.20:52100000-52200000.baq.20110328.vcf, freebayes.20:52200000-52300000.baq.20110328.vcf, freebayes.20:52300000-52400000.baq.20110328.vcf, freebayes.20:52400000-52500000.baq.20110328.vcf, freebayes.20:52500000-52600000.baq.20110328.vcf, freebayes.20:52600000-52700000.baq.20110328.vcf, freebayes.20:52700000-52800000.baq.20110328.vcf, freebayes.20:52800000-52900000.baq.20110328.vcf, freebayes.20:52900000-53000000.baq.20110328.vcf, freebayes.20:53000000-53100000.baq.20110328.vcf, freebayes.20:53100000-53200000.baq.20110328.vcf, freebayes.20:53200000-53300000.baq.20110328.vcf, freebayes.20:53300000-53400000.baq.20110328.vcf, freebayes.20:53400000-53500000.baq.20110328.vcf, freebayes.20:53500000-53600000.baq.20110328.vcf, freebayes.20:53600000-53700000.baq.20110328.vcf, freebayes.20:53700000-53800000.baq.20110328.vcf, freebayes.20:53800000-53900000.baq.20110328.vcf, freebayes.20:53900000-54000000.baq.20110328.vcf, freebayes.20:54000000-54100000.baq.20110328.vcf, freebayes.20:54100000-54200000.baq.20110328.vcf, freebayes.20:54200000-54300000.baq.20110328.vcf, freebayes.20:54300000-54400000.baq.20110328.vcf, freebayes.20:54400000-54500000.baq.20110328.vcf, freebayes.20:54500000-54600000.baq.20110328.vcf, freebayes.20:54600000-54700000.baq.20110328.vcf, freebayes.20:54700000-54800000.baq.20110328.vcf, freebayes.20:54800000-54900000.baq.20110328.vcf, freebayes.20:54900000-55000000.baq.20110328.vcf, freebayes.20:55000000-55100000.baq.20110328.vcf, freebayes.20:55100000-55200000.baq.20110328.vcf, freebayes.20:55200000-55300000.baq.20110328.vcf, freebayes.20:55300000-55400000.baq.20110328.vcf, freebayes.20:55400000-55500000.baq.20110328.vcf, freebayes.20:55500000-55600000.baq.20110328.vcf, freebayes.20:55600000-55700000.baq.20110328.vcf, freebayes.20:55700000-55800000.baq.20110328.vcf, freebayes.20:55800000-55900000.baq.20110328.vcf, 
freebayes.20:55900000-56000000.baq.20110328.vcf, freebayes.20:56000000-56100000.baq.20110328.vcf, freebayes.20:56100000-56200000.baq.20110328.vcf, freebayes.20:56200000-56300000.baq.20110328.vcf, freebayes.20:56300000-56400000.baq.20110328.vcf, freebayes.20:56400000-56500000.baq.20110328.vcf, freebayes.20:56500000-56600000.baq.20110328.vcf, freebayes.20:56600000-56700000.baq.20110328.vcf, freebayes.20:56700000-56800000.baq.20110328.vcf, freebayes.20:56800000-56900000.baq.20110328.vcf, freebayes.20:56900000-57000000.baq.20110328.vcf, freebayes.20:57000000-57100000.baq.20110328.vcf, freebayes.20:57100000-57200000.baq.20110328.vcf, freebayes.20:57200000-57300000.baq.20110328.vcf, freebayes.20:57300000-57400000.baq.20110328.vcf, freebayes.20:57400000-57500000.baq.20110328.vcf, freebayes.20:57500000-57600000.baq.20110328.vcf, freebayes.20:57600000-57700000.baq.20110328.vcf, freebayes.20:57700000-57800000.baq.20110328.vcf, freebayes.20:57800000-57900000.baq.20110328.vcf, freebayes.20:57900000-58000000.baq.20110328.vcf, freebayes.20:58000000-58100000.baq.20110328.vcf, freebayes.20:58100000-58200000.baq.20110328.vcf, freebayes.20:58200000-58300000.baq.20110328.vcf, freebayes.20:58300000-58400000.baq.20110328.vcf, freebayes.20:58400000-58500000.baq.20110328.vcf, freebayes.20:58500000-58600000.baq.20110328.vcf, freebayes.20:58600000-58700000.baq.20110328.vcf, freebayes.20:58700000-58800000.baq.20110328.vcf, freebayes.20:58800000-58900000.baq.20110328.vcf, freebayes.20:58900000-59000000.baq.20110328.vcf, freebayes.20:59000000-59100000.baq.20110328.vcf, freebayes.20:59100000-59200000.baq.20110328.vcf, freebayes.20:59200000-59300000.baq.20110328.vcf, freebayes.20:59300000-59400000.baq.20110328.vcf, freebayes.20:59400000-59500000.baq.20110328.vcf, freebayes.20:59500000-59600000.baq.20110328.vcf, freebayes.20:59600000-59700000.baq.20110328.vcf, freebayes.20:59700000-59800000.baq.20110328.vcf, freebayes.20:59800000-59900000.baq.20110328.vcf, 
freebayes.20:59900000-60000000.baq.20110328.vcf, freebayes.20:60000000-60100000.baq.20110328.vcf, freebayes.20:60100000-60200000.baq.20110328.vcf, freebayes.20:60200000-60300000.baq.20110328.vcf, freebayes.20:60300000-60400000.baq.20110328.vcf, freebayes.20:60400000-60500000.baq.20110328.vcf, freebayes.20:60500000-60600000.baq.20110328.vcf, freebayes.20:60600000-60700000.baq.20110328.vcf, freebayes.20:60700000-60800000.baq.20110328.vcf, freebayes.20:60800000-60900000.baq.20110328.vcf, freebayes.20:60900000-61000000.baq.20110328.vcf, freebayes.20:61000000-61100000.baq.20110328.vcf, freebayes.20:61100000-61200000.baq.20110328.vcf, freebayes.20:61200000-61300000.baq.20110328.vcf, freebayes.20:61300000-61400000.baq.20110328.vcf, freebayes.20:61400000-61500000.baq.20110328.vcf, freebayes.20:61500000-61600000.baq.20110328.vcf, freebayes.20:61600000-61700000.baq.20110328.vcf, freebayes.20:61700000-61800000.baq.20110328.vcf, freebayes.20:61800000-61900000.baq.20110328.vcf, freebayes.20:61900000-62000000.baq.20110328.vcf, freebayes.20:62000000-62100000.baq.20110328.vcf, freebayes.20:62100000-62200000.baq.20110328.vcf, freebayes.20:62200000-62300000.baq.20110328.vcf, freebayes.20:62300000-62400000.baq.20110328.vcf, freebayes.20:62400000-62500000.baq.20110328.vcf, freebayes.20:62500000-62600000.baq.20110328.vcf, freebayes.20:62600000-62700000.baq.20110328.vcf, freebayes.20:62700000-62800000.baq.20110328.vcf, freebayes.20:62800000-62900000.baq.20110328.vcf, freebayes.20:62900000-63000000.baq.20110328.vcf, freebayes.20:63000000-63025520.baq.20110328.vcf -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO 20 458502 . 
G GA 4567.01 PASS AA=20;AB=0.61111;ABA=14;ABP=6.8707;ABR=22;AC=38;AF=0.0544;AN=698;BL=374;BR=1129;BVAR;BaseQRankSum=13.364;DP=15979;DP4=1882,2188,45,37;Dels=0.00;EL=5;EPP=13.868;ER=15;FR;FS=6.503;HETAR=11;HOMA=2;HOMR=985;HP=1;HPLen=2;HR=2;HRun=0;HU=G;INDEL;INS;InbreedingCoeff=0.0157;LEN=1;LRB=0.50233;LRBP=826.56;MQ=66.16;MQ0Fraction=0.0110;MQM=70.5;MQRankSum=-3.158;NF;NR;NS=998;PP;PV4=0.15,1,0.42,0.15;RA=3173;RL=1;RPP=38.188;RR=19;RUN=1;ReadPosRankSum=-2.346;SAB=0.7;SAF=14;SAP=9.959;SAR=6;SC=GGGCGTGGTGGTGCATGTAAT;SRB=0.50047;SRF=1588;SRP=3.0165;SRR=1585;TC;TR=9;TU=GGT;VQSLOD=10.0079;set=Intersection;sumGLbyD=23.94 20 539571 . TG T 18546 PASS AA=71;AB=0.92482;ABA=63;ABP=1316.6;ABR=775;AC=42;AF=0.03512;AN=1196;BL=3915;BR=252;BVAR;BaseQRankSum=0.556;DEL;DP=10073;Dels=0.01;EL=47;EPP=19.189;ER=24;FS=2.124;HETAR=290;HOMA=156;HOMR=570;HRun=1;InbreedingCoeff=0.0620;LEN=1;LRB=0.87905;LRBP=6995.1;MQ0=0;MQ0Fraction=0.0000;MQM=127.99;MQRankSum=0.410;NS=1016;RA=3090;RL=71;RPP=157.18;RR=0;RUN=1;ReadPosRankSum=-11.038;SAB=0.66197;SAF=47;SAP=19.189;SAR=24;SRB=0.55016;SRF=1700;SRP=70.544;SRR=1390;VQSLOD=2.6772;set=filterInVQSR-2of5;sumGLbyD=4.71 20 573764 . 
TA T 591.51 PASS AC=91;AF=0.1987;AN=458;BaseQRankSum=0.137;DP=519;FS=3.153;HRun=1;HaplotypeScore=14.0744;InbreedingCoeff=0.1460;MQ=48.16;MQ0=26;MQ0Fraction=0.0501;MQRankSum=-1.636;QD=3.63;ReadPosRankSum=-4.140;SB=-408.14;VQSLOD=5.2458;set=VQSR diff --git a/tests/tabix_data/vcf/2.vcf b/tests/tabix_data/vcf/2.vcf index c77f2c4..1b338fc 100644 --- a/tests/tabix_data/vcf/2.vcf +++ b/tests/tabix_data/vcf/2.vcf @@ -131,7 +131,7 @@ ##source_20110031.1=/nfs/users/nfs_p/pd3/cvs/vcftools/perl/vcf-annotate -d /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.desc -a /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.tab.gz -c CHROM,FROM,INFO/VLD,INFO/KGPilot123,INFO/dbSNP ##vcfCTools=filter ##vcfCtools=merge freebayes.20:0-100000.baq.20110328.vcf, freebayes.20:100000-200000.baq.20110328.vcf, freebayes.20:200000-300000.baq.20110328.vcf, freebayes.20:300000-400000.baq.20110328.vcf, freebayes.20:400000-500000.baq.20110328.vcf, freebayes.20:500000-600000.baq.20110328.vcf, freebayes.20:600000-700000.baq.20110328.vcf, freebayes.20:700000-800000.baq.20110328.vcf, freebayes.20:800000-900000.baq.20110328.vcf, freebayes.20:900000-1000000.baq.20110328.vcf, freebayes.20:1000000-1100000.baq.20110328.vcf, freebayes.20:1100000-1200000.baq.20110328.vcf, freebayes.20:1200000-1300000.baq.20110328.vcf, freebayes.20:1300000-1400000.baq.20110328.vcf, freebayes.20:1400000-1500000.baq.20110328.vcf, freebayes.20:1500000-1600000.baq.20110328.vcf, freebayes.20:1600000-1700000.baq.20110328.vcf, freebayes.20:1700000-1800000.baq.20110328.vcf, freebayes.20:1800000-1900000.baq.20110328.vcf, freebayes.20:1900000-2000000.baq.20110328.vcf, freebayes.20:2000000-2100000.baq.20110328.vcf, freebayes.20:2100000-2200000.baq.20110328.vcf, freebayes.20:2200000-2300000.baq.20110328.vcf, freebayes.20:2300000-2400000.baq.20110328.vcf, freebayes.20:2400000-2500000.baq.20110328.vcf, freebayes.20:2500000-2600000.baq.20110328.vcf, freebayes.20:2600000-2700000.baq.20110328.vcf, 
freebayes.20:2700000-2800000.baq.20110328.vcf, freebayes.20:2800000-2900000.baq.20110328.vcf, freebayes.20:2900000-3000000.baq.20110328.vcf, freebayes.20:3000000-3100000.baq.20110328.vcf, freebayes.20:3100000-3200000.baq.20110328.vcf, freebayes.20:3200000-3300000.baq.20110328.vcf, freebayes.20:3300000-3400000.baq.20110328.vcf, freebayes.20:3400000-3500000.baq.20110328.vcf, freebayes.20:3500000-3600000.baq.20110328.vcf, freebayes.20:3600000-3700000.baq.20110328.vcf, freebayes.20:3700000-3800000.baq.20110328.vcf, freebayes.20:3800000-3900000.baq.20110328.vcf, freebayes.20:3900000-4000000.baq.20110328.vcf, freebayes.20:4000000-4100000.baq.20110328.vcf, freebayes.20:4100000-4200000.baq.20110328.vcf, freebayes.20:4200000-4300000.baq.20110328.vcf, freebayes.20:4300000-4400000.baq.20110328.vcf, freebayes.20:4400000-4500000.baq.20110328.vcf, freebayes.20:4500000-4600000.baq.20110328.vcf, freebayes.20:4600000-4700000.baq.20110328.vcf, freebayes.20:4700000-4800000.baq.20110328.vcf, freebayes.20:4800000-4900000.baq.20110328.vcf, freebayes.20:4900000-5000000.baq.20110328.vcf, freebayes.20:5000000-5100000.baq.20110328.vcf, freebayes.20:5100000-5200000.baq.20110328.vcf, freebayes.20:5200000-5300000.baq.20110328.vcf, freebayes.20:5300000-5400000.baq.20110328.vcf, freebayes.20:5400000-5500000.baq.20110328.vcf, freebayes.20:5500000-5600000.baq.20110328.vcf, freebayes.20:5600000-5700000.baq.20110328.vcf, freebayes.20:5700000-5800000.baq.20110328.vcf, freebayes.20:5800000-5900000.baq.20110328.vcf, freebayes.20:5900000-6000000.baq.20110328.vcf, freebayes.20:6000000-6100000.baq.20110328.vcf, freebayes.20:6100000-6200000.baq.20110328.vcf, freebayes.20:6200000-6300000.baq.20110328.vcf, freebayes.20:6300000-6400000.baq.20110328.vcf, freebayes.20:6400000-6500000.baq.20110328.vcf, freebayes.20:6500000-6600000.baq.20110328.vcf, freebayes.20:6600000-6700000.baq.20110328.vcf, freebayes.20:6700000-6800000.baq.20110328.vcf, freebayes.20:6800000-6900000.baq.20110328.vcf, 
freebayes.20:6900000-7000000.baq.20110328.vcf, freebayes.20:7000000-7100000.baq.20110328.vcf, freebayes.20:7100000-7200000.baq.20110328.vcf, freebayes.20:7200000-7300000.baq.20110328.vcf, freebayes.20:7300000-7400000.baq.20110328.vcf, freebayes.20:7400000-7500000.baq.20110328.vcf, freebayes.20:7500000-7600000.baq.20110328.vcf, freebayes.20:7600000-7700000.baq.20110328.vcf, freebayes.20:7700000-7800000.baq.20110328.vcf, freebayes.20:7800000-7900000.baq.20110328.vcf, freebayes.20:7900000-8000000.baq.20110328.vcf, freebayes.20:8000000-8100000.baq.20110328.vcf, freebayes.20:8100000-8200000.baq.20110328.vcf, freebayes.20:8200000-8300000.baq.20110328.vcf, freebayes.20:8300000-8400000.baq.20110328.vcf, freebayes.20:8400000-8500000.baq.20110328.vcf, freebayes.20:8500000-8600000.baq.20110328.vcf, freebayes.20:8600000-8700000.baq.20110328.vcf, freebayes.20:8700000-8800000.baq.20110328.vcf, freebayes.20:8800000-8900000.baq.20110328.vcf, freebayes.20:8900000-9000000.baq.20110328.vcf, freebayes.20:9000000-9100000.baq.20110328.vcf, freebayes.20:9100000-9200000.baq.20110328.vcf, freebayes.20:9200000-9300000.baq.20110328.vcf, freebayes.20:9300000-9400000.baq.20110328.vcf, freebayes.20:9400000-9500000.baq.20110328.vcf, freebayes.20:9500000-9600000.baq.20110328.vcf, freebayes.20:9600000-9700000.baq.20110328.vcf, freebayes.20:9700000-9800000.baq.20110328.vcf, freebayes.20:9800000-9900000.baq.20110328.vcf, freebayes.20:9900000-10000000.baq.20110328.vcf, freebayes.20:10000000-10100000.baq.20110328.vcf, freebayes.20:10100000-10200000.baq.20110328.vcf, freebayes.20:10200000-10300000.baq.20110328.vcf, freebayes.20:10300000-10400000.baq.20110328.vcf, freebayes.20:10400000-10500000.baq.20110328.vcf, freebayes.20:10500000-10600000.baq.20110328.vcf, freebayes.20:10600000-10700000.baq.20110328.vcf, freebayes.20:10700000-10800000.baq.20110328.vcf, freebayes.20:10800000-10900000.baq.20110328.vcf, freebayes.20:10900000-11000000.baq.20110328.vcf, freebayes.20:11000000-11100000.baq.20110328.vcf, 
freebayes.20:11100000-11200000.baq.20110328.vcf, freebayes.20:11200000-11300000.baq.20110328.vcf, freebayes.20:11300000-11400000.baq.20110328.vcf, freebayes.20:11400000-11500000.baq.20110328.vcf, freebayes.20:11500000-11600000.baq.20110328.vcf, freebayes.20:11600000-11700000.baq.20110328.vcf, freebayes.20:11700000-11800000.baq.20110328.vcf, freebayes.20:11800000-11900000.baq.20110328.vcf, freebayes.20:11900000-12000000.baq.20110328.vcf, freebayes.20:12000000-12100000.baq.20110328.vcf, freebayes.20:12100000-12200000.baq.20110328.vcf, freebayes.20:12200000-12300000.baq.20110328.vcf, freebayes.20:12300000-12400000.baq.20110328.vcf, freebayes.20:12400000-12500000.baq.20110328.vcf, freebayes.20:12500000-12600000.baq.20110328.vcf, freebayes.20:12600000-12700000.baq.20110328.vcf, freebayes.20:12700000-12800000.baq.20110328.vcf, freebayes.20:12800000-12900000.baq.20110328.vcf, freebayes.20:12900000-13000000.baq.20110328.vcf, freebayes.20:13000000-13100000.baq.20110328.vcf, freebayes.20:13100000-13200000.baq.20110328.vcf, freebayes.20:13200000-13300000.baq.20110328.vcf, freebayes.20:13300000-13400000.baq.20110328.vcf, freebayes.20:13400000-13500000.baq.20110328.vcf, freebayes.20:13500000-13600000.baq.20110328.vcf, freebayes.20:13600000-13700000.baq.20110328.vcf, freebayes.20:13700000-13800000.baq.20110328.vcf, freebayes.20:13800000-13900000.baq.20110328.vcf, freebayes.20:13900000-14000000.baq.20110328.vcf, freebayes.20:14000000-14100000.baq.20110328.vcf, freebayes.20:14100000-14200000.baq.20110328.vcf, freebayes.20:14200000-14300000.baq.20110328.vcf, freebayes.20:14300000-14400000.baq.20110328.vcf, freebayes.20:14400000-14500000.baq.20110328.vcf, freebayes.20:14500000-14600000.baq.20110328.vcf, freebayes.20:14600000-14700000.baq.20110328.vcf, freebayes.20:14700000-14800000.baq.20110328.vcf, freebayes.20:14800000-14900000.baq.20110328.vcf, freebayes.20:14900000-15000000.baq.20110328.vcf, freebayes.20:15000000-15100000.baq.20110328.vcf, 
freebayes.20:15100000-15200000.baq.20110328.vcf, freebayes.20:15200000-15300000.baq.20110328.vcf, freebayes.20:15300000-15400000.baq.20110328.vcf, freebayes.20:15400000-15500000.baq.20110328.vcf, freebayes.20:15500000-15600000.baq.20110328.vcf, freebayes.20:15600000-15700000.baq.20110328.vcf, freebayes.20:15700000-15800000.baq.20110328.vcf, freebayes.20:15800000-15900000.baq.20110328.vcf, freebayes.20:15900000-16000000.baq.20110328.vcf, freebayes.20:16000000-16100000.baq.20110328.vcf, freebayes.20:16100000-16200000.baq.20110328.vcf, freebayes.20:16200000-16300000.baq.20110328.vcf, freebayes.20:16300000-16400000.baq.20110328.vcf, freebayes.20:16400000-16500000.baq.20110328.vcf, freebayes.20:16500000-16600000.baq.20110328.vcf, freebayes.20:16600000-16700000.baq.20110328.vcf, freebayes.20:16700000-16800000.baq.20110328.vcf, freebayes.20:16800000-16900000.baq.20110328.vcf, freebayes.20:16900000-17000000.baq.20110328.vcf, freebayes.20:17000000-17100000.baq.20110328.vcf, freebayes.20:17100000-17200000.baq.20110328.vcf, freebayes.20:17200000-17300000.baq.20110328.vcf, freebayes.20:17300000-17400000.baq.20110328.vcf, freebayes.20:17400000-17500000.baq.20110328.vcf, freebayes.20:17500000-17600000.baq.20110328.vcf, freebayes.20:17600000-17700000.baq.20110328.vcf, freebayes.20:17700000-17800000.baq.20110328.vcf, freebayes.20:17800000-17900000.baq.20110328.vcf, freebayes.20:17900000-18000000.baq.20110328.vcf, freebayes.20:18000000-18100000.baq.20110328.vcf, freebayes.20:18100000-18200000.baq.20110328.vcf, freebayes.20:18200000-18300000.baq.20110328.vcf, freebayes.20:18300000-18400000.baq.20110328.vcf, freebayes.20:18400000-18500000.baq.20110328.vcf, freebayes.20:18500000-18600000.baq.20110328.vcf, freebayes.20:18600000-18700000.baq.20110328.vcf, freebayes.20:18700000-18800000.baq.20110328.vcf, freebayes.20:18800000-18900000.baq.20110328.vcf, freebayes.20:18900000-19000000.baq.20110328.vcf, freebayes.20:19000000-19100000.baq.20110328.vcf, 
freebayes.20:19100000-19200000.baq.20110328.vcf, freebayes.20:19200000-19300000.baq.20110328.vcf, freebayes.20:19300000-19400000.baq.20110328.vcf, freebayes.20:19400000-19500000.baq.20110328.vcf, freebayes.20:19500000-19600000.baq.20110328.vcf, freebayes.20:19600000-19700000.baq.20110328.vcf, freebayes.20:19700000-19800000.baq.20110328.vcf, freebayes.20:19800000-19900000.baq.20110328.vcf, freebayes.20:19900000-20000000.baq.20110328.vcf, freebayes.20:20000000-20100000.baq.20110328.vcf, freebayes.20:20100000-20200000.baq.20110328.vcf, freebayes.20:20200000-20300000.baq.20110328.vcf, freebayes.20:20300000-20400000.baq.20110328.vcf, freebayes.20:20400000-20500000.baq.20110328.vcf, freebayes.20:20500000-20600000.baq.20110328.vcf, freebayes.20:20600000-20700000.baq.20110328.vcf, freebayes.20:20700000-20800000.baq.20110328.vcf, freebayes.20:20800000-20900000.baq.20110328.vcf, freebayes.20:20900000-21000000.baq.20110328.vcf, freebayes.20:21000000-21100000.baq.20110328.vcf, freebayes.20:21100000-21200000.baq.20110328.vcf, freebayes.20:21200000-21300000.baq.20110328.vcf, freebayes.20:21300000-21400000.baq.20110328.vcf, freebayes.20:21400000-21500000.baq.20110328.vcf, freebayes.20:21500000-21600000.baq.20110328.vcf, freebayes.20:21600000-21700000.baq.20110328.vcf, freebayes.20:21700000-21800000.baq.20110328.vcf, freebayes.20:21800000-21900000.baq.20110328.vcf, freebayes.20:21900000-22000000.baq.20110328.vcf, freebayes.20:22000000-22100000.baq.20110328.vcf, freebayes.20:22100000-22200000.baq.20110328.vcf, freebayes.20:22200000-22300000.baq.20110328.vcf, freebayes.20:22300000-22400000.baq.20110328.vcf, freebayes.20:22400000-22500000.baq.20110328.vcf, freebayes.20:22500000-22600000.baq.20110328.vcf, freebayes.20:22600000-22700000.baq.20110328.vcf, freebayes.20:22700000-22800000.baq.20110328.vcf, freebayes.20:22800000-22900000.baq.20110328.vcf, freebayes.20:22900000-23000000.baq.20110328.vcf, freebayes.20:23000000-23100000.baq.20110328.vcf, 
freebayes.20:23100000-23200000.baq.20110328.vcf, freebayes.20:23200000-23300000.baq.20110328.vcf, freebayes.20:23300000-23400000.baq.20110328.vcf, freebayes.20:23400000-23500000.baq.20110328.vcf, freebayes.20:23500000-23600000.baq.20110328.vcf, freebayes.20:23600000-23700000.baq.20110328.vcf, freebayes.20:23700000-23800000.baq.20110328.vcf, freebayes.20:23800000-23900000.baq.20110328.vcf, freebayes.20:23900000-24000000.baq.20110328.vcf, freebayes.20:24000000-24100000.baq.20110328.vcf, freebayes.20:24100000-24200000.baq.20110328.vcf, freebayes.20:24200000-24300000.baq.20110328.vcf, freebayes.20:24300000-24400000.baq.20110328.vcf, freebayes.20:24400000-24500000.baq.20110328.vcf, freebayes.20:24500000-24600000.baq.20110328.vcf, freebayes.20:24600000-24700000.baq.20110328.vcf, freebayes.20:24700000-24800000.baq.20110328.vcf, freebayes.20:24800000-24900000.baq.20110328.vcf, freebayes.20:24900000-25000000.baq.20110328.vcf, freebayes.20:25000000-25100000.baq.20110328.vcf, freebayes.20:25100000-25200000.baq.20110328.vcf, freebayes.20:25200000-25300000.baq.20110328.vcf, freebayes.20:25300000-25400000.baq.20110328.vcf, freebayes.20:25400000-25500000.baq.20110328.vcf, freebayes.20:25500000-25600000.baq.20110328.vcf, freebayes.20:25600000-25700000.baq.20110328.vcf, freebayes.20:25700000-25800000.baq.20110328.vcf, freebayes.20:25800000-25900000.baq.20110328.vcf, freebayes.20:25900000-26000000.baq.20110328.vcf, freebayes.20:26000000-26100000.baq.20110328.vcf, freebayes.20:26100000-26200000.baq.20110328.vcf, freebayes.20:26200000-26300000.baq.20110328.vcf, freebayes.20:26300000-26400000.baq.20110328.vcf, freebayes.20:26400000-26500000.baq.20110328.vcf, freebayes.20:26500000-26600000.baq.20110328.vcf, freebayes.20:26600000-26700000.baq.20110328.vcf, freebayes.20:26700000-26800000.baq.20110328.vcf, freebayes.20:26800000-26900000.baq.20110328.vcf, freebayes.20:26900000-27000000.baq.20110328.vcf, freebayes.20:27000000-27100000.baq.20110328.vcf, 
freebayes.20:27100000-27200000.baq.20110328.vcf, freebayes.20:27200000-27300000.baq.20110328.vcf, freebayes.20:27300000-27400000.baq.20110328.vcf, freebayes.20:27400000-27500000.baq.20110328.vcf, freebayes.20:27500000-27600000.baq.20110328.vcf, freebayes.20:27600000-27700000.baq.20110328.vcf, freebayes.20:27700000-27800000.baq.20110328.vcf, freebayes.20:27800000-27900000.baq.20110328.vcf, freebayes.20:27900000-28000000.baq.20110328.vcf, freebayes.20:28000000-28100000.baq.20110328.vcf, freebayes.20:28100000-28200000.baq.20110328.vcf, freebayes.20:28200000-28300000.baq.20110328.vcf, freebayes.20:28300000-28400000.baq.20110328.vcf, freebayes.20:28400000-28500000.baq.20110328.vcf, freebayes.20:28500000-28600000.baq.20110328.vcf, freebayes.20:28600000-28700000.baq.20110328.vcf, freebayes.20:28700000-28800000.baq.20110328.vcf, freebayes.20:28800000-28900000.baq.20110328.vcf, freebayes.20:28900000-29000000.baq.20110328.vcf, freebayes.20:29000000-29100000.baq.20110328.vcf, freebayes.20:29100000-29200000.baq.20110328.vcf, freebayes.20:29200000-29300000.baq.20110328.vcf, freebayes.20:29300000-29400000.baq.20110328.vcf, freebayes.20:29400000-29500000.baq.20110328.vcf, freebayes.20:29500000-29600000.baq.20110328.vcf, freebayes.20:29600000-29700000.baq.20110328.vcf, freebayes.20:29700000-29800000.baq.20110328.vcf, freebayes.20:29800000-29900000.baq.20110328.vcf, freebayes.20:29900000-30000000.baq.20110328.vcf, freebayes.20:30000000-30100000.baq.20110328.vcf, freebayes.20:30100000-30200000.baq.20110328.vcf, freebayes.20:30200000-30300000.baq.20110328.vcf, freebayes.20:30300000-30400000.baq.20110328.vcf, freebayes.20:30400000-30500000.baq.20110328.vcf, freebayes.20:30500000-30600000.baq.20110328.vcf, freebayes.20:30600000-30700000.baq.20110328.vcf, freebayes.20:30700000-30800000.baq.20110328.vcf, freebayes.20:30800000-30900000.baq.20110328.vcf, freebayes.20:30900000-31000000.baq.20110328.vcf, freebayes.20:31000000-31100000.baq.20110328.vcf, 
freebayes.20:31100000-31200000.baq.20110328.vcf, freebayes.20:31200000-31300000.baq.20110328.vcf, freebayes.20:31300000-31400000.baq.20110328.vcf, freebayes.20:31400000-31500000.baq.20110328.vcf, freebayes.20:31500000-31600000.baq.20110328.vcf, freebayes.20:31600000-31700000.baq.20110328.vcf, freebayes.20:31700000-31800000.baq.20110328.vcf, freebayes.20:31800000-31900000.baq.20110328.vcf, freebayes.20:31900000-32000000.baq.20110328.vcf, freebayes.20:32000000-32100000.baq.20110328.vcf, freebayes.20:32100000-32200000.baq.20110328.vcf, freebayes.20:32200000-32300000.baq.20110328.vcf, freebayes.20:32300000-32400000.baq.20110328.vcf, freebayes.20:32400000-32500000.baq.20110328.vcf, freebayes.20:32500000-32600000.baq.20110328.vcf, freebayes.20:32600000-32700000.baq.20110328.vcf, freebayes.20:32700000-32800000.baq.20110328.vcf, freebayes.20:32800000-32900000.baq.20110328.vcf, freebayes.20:32900000-33000000.baq.20110328.vcf, freebayes.20:33000000-33100000.baq.20110328.vcf, freebayes.20:33100000-33200000.baq.20110328.vcf, freebayes.20:33200000-33300000.baq.20110328.vcf, freebayes.20:33300000-33400000.baq.20110328.vcf, freebayes.20:33400000-33500000.baq.20110328.vcf, freebayes.20:33500000-33600000.baq.20110328.vcf, freebayes.20:33600000-33700000.baq.20110328.vcf, freebayes.20:33700000-33800000.baq.20110328.vcf, freebayes.20:33800000-33900000.baq.20110328.vcf, freebayes.20:33900000-34000000.baq.20110328.vcf, freebayes.20:34000000-34100000.baq.20110328.vcf, freebayes.20:34100000-34200000.baq.20110328.vcf, freebayes.20:34200000-34300000.baq.20110328.vcf, freebayes.20:34300000-34400000.baq.20110328.vcf, freebayes.20:34400000-34500000.baq.20110328.vcf, freebayes.20:34500000-34600000.baq.20110328.vcf, freebayes.20:34600000-34700000.baq.20110328.vcf, freebayes.20:34700000-34800000.baq.20110328.vcf, freebayes.20:34800000-34900000.baq.20110328.vcf, freebayes.20:34900000-35000000.baq.20110328.vcf, freebayes.20:35000000-35100000.baq.20110328.vcf, 
freebayes.20:35100000-35200000.baq.20110328.vcf, freebayes.20:35200000-35300000.baq.20110328.vcf, freebayes.20:35300000-35400000.baq.20110328.vcf, freebayes.20:35400000-35500000.baq.20110328.vcf, freebayes.20:35500000-35600000.baq.20110328.vcf, freebayes.20:35600000-35700000.baq.20110328.vcf, freebayes.20:35700000-35800000.baq.20110328.vcf, freebayes.20:35800000-35900000.baq.20110328.vcf, freebayes.20:35900000-36000000.baq.20110328.vcf, freebayes.20:36000000-36100000.baq.20110328.vcf, freebayes.20:36100000-36200000.baq.20110328.vcf, freebayes.20:36200000-36300000.baq.20110328.vcf, freebayes.20:36300000-36400000.baq.20110328.vcf, freebayes.20:36400000-36500000.baq.20110328.vcf, freebayes.20:36500000-36600000.baq.20110328.vcf, freebayes.20:36600000-36700000.baq.20110328.vcf, freebayes.20:36700000-36800000.baq.20110328.vcf, freebayes.20:36800000-36900000.baq.20110328.vcf, freebayes.20:36900000-37000000.baq.20110328.vcf, freebayes.20:37000000-37100000.baq.20110328.vcf, freebayes.20:37100000-37200000.baq.20110328.vcf, freebayes.20:37200000-37300000.baq.20110328.vcf, freebayes.20:37300000-37400000.baq.20110328.vcf, freebayes.20:37400000-37500000.baq.20110328.vcf, freebayes.20:37500000-37600000.baq.20110328.vcf, freebayes.20:37600000-37700000.baq.20110328.vcf, freebayes.20:37700000-37800000.baq.20110328.vcf, freebayes.20:37800000-37900000.baq.20110328.vcf, freebayes.20:37900000-38000000.baq.20110328.vcf, freebayes.20:38000000-38100000.baq.20110328.vcf, freebayes.20:38100000-38200000.baq.20110328.vcf, freebayes.20:38200000-38300000.baq.20110328.vcf, freebayes.20:38300000-38400000.baq.20110328.vcf, freebayes.20:38400000-38500000.baq.20110328.vcf, freebayes.20:38500000-38600000.baq.20110328.vcf, freebayes.20:38600000-38700000.baq.20110328.vcf, freebayes.20:38700000-38800000.baq.20110328.vcf, freebayes.20:38800000-38900000.baq.20110328.vcf, freebayes.20:38900000-39000000.baq.20110328.vcf, freebayes.20:39000000-39100000.baq.20110328.vcf, 
freebayes.20:39100000-39200000.baq.20110328.vcf, freebayes.20:39200000-39300000.baq.20110328.vcf, freebayes.20:39300000-39400000.baq.20110328.vcf, freebayes.20:39400000-39500000.baq.20110328.vcf, freebayes.20:39500000-39600000.baq.20110328.vcf, freebayes.20:39600000-39700000.baq.20110328.vcf, freebayes.20:39700000-39800000.baq.20110328.vcf, freebayes.20:39800000-39900000.baq.20110328.vcf, freebayes.20:39900000-40000000.baq.20110328.vcf, freebayes.20:40000000-40100000.baq.20110328.vcf, freebayes.20:40100000-40200000.baq.20110328.vcf, freebayes.20:40200000-40300000.baq.20110328.vcf, freebayes.20:40300000-40400000.baq.20110328.vcf, freebayes.20:40400000-40500000.baq.20110328.vcf, freebayes.20:40500000-40600000.baq.20110328.vcf, freebayes.20:40600000-40700000.baq.20110328.vcf, freebayes.20:40700000-40800000.baq.20110328.vcf, freebayes.20:40800000-40900000.baq.20110328.vcf, freebayes.20:40900000-41000000.baq.20110328.vcf, freebayes.20:41000000-41100000.baq.20110328.vcf, freebayes.20:41100000-41200000.baq.20110328.vcf, freebayes.20:41200000-41300000.baq.20110328.vcf, freebayes.20:41300000-41400000.baq.20110328.vcf, freebayes.20:41400000-41500000.baq.20110328.vcf, freebayes.20:41500000-41600000.baq.20110328.vcf, freebayes.20:41600000-41700000.baq.20110328.vcf, freebayes.20:41700000-41800000.baq.20110328.vcf, freebayes.20:41800000-41900000.baq.20110328.vcf, freebayes.20:41900000-42000000.baq.20110328.vcf, freebayes.20:42000000-42100000.baq.20110328.vcf, freebayes.20:42100000-42200000.baq.20110328.vcf, freebayes.20:42200000-42300000.baq.20110328.vcf, freebayes.20:42300000-42400000.baq.20110328.vcf, freebayes.20:42400000-42500000.baq.20110328.vcf, freebayes.20:42500000-42600000.baq.20110328.vcf, freebayes.20:42600000-42700000.baq.20110328.vcf, freebayes.20:42700000-42800000.baq.20110328.vcf, freebayes.20:42800000-42900000.baq.20110328.vcf, freebayes.20:42900000-43000000.baq.20110328.vcf, freebayes.20:43000000-43100000.baq.20110328.vcf, 
freebayes.20:43100000-43200000.baq.20110328.vcf, freebayes.20:43200000-43300000.baq.20110328.vcf, freebayes.20:43300000-43400000.baq.20110328.vcf, freebayes.20:43400000-43500000.baq.20110328.vcf, freebayes.20:43500000-43600000.baq.20110328.vcf, freebayes.20:43600000-43700000.baq.20110328.vcf, freebayes.20:43700000-43800000.baq.20110328.vcf, freebayes.20:43800000-43900000.baq.20110328.vcf, freebayes.20:43900000-44000000.baq.20110328.vcf, freebayes.20:44000000-44100000.baq.20110328.vcf, freebayes.20:44100000-44200000.baq.20110328.vcf, freebayes.20:44200000-44300000.baq.20110328.vcf, freebayes.20:44300000-44400000.baq.20110328.vcf, freebayes.20:44400000-44500000.baq.20110328.vcf, freebayes.20:44500000-44600000.baq.20110328.vcf, freebayes.20:44600000-44700000.baq.20110328.vcf, freebayes.20:44700000-44800000.baq.20110328.vcf, freebayes.20:44800000-44900000.baq.20110328.vcf, freebayes.20:44900000-45000000.baq.20110328.vcf, freebayes.20:45000000-45100000.baq.20110328.vcf, freebayes.20:45100000-45200000.baq.20110328.vcf, freebayes.20:45200000-45300000.baq.20110328.vcf, freebayes.20:45300000-45400000.baq.20110328.vcf, freebayes.20:45400000-45500000.baq.20110328.vcf, freebayes.20:45500000-45600000.baq.20110328.vcf, freebayes.20:45600000-45700000.baq.20110328.vcf, freebayes.20:45700000-45800000.baq.20110328.vcf, freebayes.20:45800000-45900000.baq.20110328.vcf, freebayes.20:45900000-46000000.baq.20110328.vcf, freebayes.20:46000000-46100000.baq.20110328.vcf, freebayes.20:46100000-46200000.baq.20110328.vcf, freebayes.20:46200000-46300000.baq.20110328.vcf, freebayes.20:46300000-46400000.baq.20110328.vcf, freebayes.20:46400000-46500000.baq.20110328.vcf, freebayes.20:46500000-46600000.baq.20110328.vcf, freebayes.20:46600000-46700000.baq.20110328.vcf, freebayes.20:46700000-46800000.baq.20110328.vcf, freebayes.20:46800000-46900000.baq.20110328.vcf, freebayes.20:46900000-47000000.baq.20110328.vcf, freebayes.20:47000000-47100000.baq.20110328.vcf, 
freebayes.20:47100000-47200000.baq.20110328.vcf, freebayes.20:47200000-47300000.baq.20110328.vcf, freebayes.20:47300000-47400000.baq.20110328.vcf, freebayes.20:47400000-47500000.baq.20110328.vcf, freebayes.20:47500000-47600000.baq.20110328.vcf, freebayes.20:47600000-47700000.baq.20110328.vcf, freebayes.20:47700000-47800000.baq.20110328.vcf, freebayes.20:47800000-47900000.baq.20110328.vcf, freebayes.20:47900000-48000000.baq.20110328.vcf, freebayes.20:48000000-48100000.baq.20110328.vcf, freebayes.20:48100000-48200000.baq.20110328.vcf, freebayes.20:48200000-48300000.baq.20110328.vcf, freebayes.20:48300000-48400000.baq.20110328.vcf, freebayes.20:48400000-48500000.baq.20110328.vcf, freebayes.20:48500000-48600000.baq.20110328.vcf, freebayes.20:48600000-48700000.baq.20110328.vcf, freebayes.20:48700000-48800000.baq.20110328.vcf, freebayes.20:48800000-48900000.baq.20110328.vcf, freebayes.20:48900000-49000000.baq.20110328.vcf, freebayes.20:49000000-49100000.baq.20110328.vcf, freebayes.20:49100000-49200000.baq.20110328.vcf, freebayes.20:49200000-49300000.baq.20110328.vcf, freebayes.20:49300000-49400000.baq.20110328.vcf, freebayes.20:49400000-49500000.baq.20110328.vcf, freebayes.20:49500000-49600000.baq.20110328.vcf, freebayes.20:49600000-49700000.baq.20110328.vcf, freebayes.20:49700000-49800000.baq.20110328.vcf, freebayes.20:49800000-49900000.baq.20110328.vcf, freebayes.20:49900000-50000000.baq.20110328.vcf, freebayes.20:50000000-50100000.baq.20110328.vcf, freebayes.20:50100000-50200000.baq.20110328.vcf, freebayes.20:50200000-50300000.baq.20110328.vcf, freebayes.20:50300000-50400000.baq.20110328.vcf, freebayes.20:50400000-50500000.baq.20110328.vcf, freebayes.20:50500000-50600000.baq.20110328.vcf, freebayes.20:50600000-50700000.baq.20110328.vcf, freebayes.20:50700000-50800000.baq.20110328.vcf, freebayes.20:50800000-50900000.baq.20110328.vcf, freebayes.20:50900000-51000000.baq.20110328.vcf, freebayes.20:51000000-51100000.baq.20110328.vcf, 
freebayes.20:51100000-51200000.baq.20110328.vcf, freebayes.20:51200000-51300000.baq.20110328.vcf, freebayes.20:51300000-51400000.baq.20110328.vcf, freebayes.20:51400000-51500000.baq.20110328.vcf, freebayes.20:51500000-51600000.baq.20110328.vcf, freebayes.20:51600000-51700000.baq.20110328.vcf, freebayes.20:51700000-51800000.baq.20110328.vcf, freebayes.20:51800000-51900000.baq.20110328.vcf, freebayes.20:51900000-52000000.baq.20110328.vcf, freebayes.20:52000000-52100000.baq.20110328.vcf, freebayes.20:52100000-52200000.baq.20110328.vcf, freebayes.20:52200000-52300000.baq.20110328.vcf, freebayes.20:52300000-52400000.baq.20110328.vcf, freebayes.20:52400000-52500000.baq.20110328.vcf, freebayes.20:52500000-52600000.baq.20110328.vcf, freebayes.20:52600000-52700000.baq.20110328.vcf, freebayes.20:52700000-52800000.baq.20110328.vcf, freebayes.20:52800000-52900000.baq.20110328.vcf, freebayes.20:52900000-53000000.baq.20110328.vcf, freebayes.20:53000000-53100000.baq.20110328.vcf, freebayes.20:53100000-53200000.baq.20110328.vcf, freebayes.20:53200000-53300000.baq.20110328.vcf, freebayes.20:53300000-53400000.baq.20110328.vcf, freebayes.20:53400000-53500000.baq.20110328.vcf, freebayes.20:53500000-53600000.baq.20110328.vcf, freebayes.20:53600000-53700000.baq.20110328.vcf, freebayes.20:53700000-53800000.baq.20110328.vcf, freebayes.20:53800000-53900000.baq.20110328.vcf, freebayes.20:53900000-54000000.baq.20110328.vcf, freebayes.20:54000000-54100000.baq.20110328.vcf, freebayes.20:54100000-54200000.baq.20110328.vcf, freebayes.20:54200000-54300000.baq.20110328.vcf, freebayes.20:54300000-54400000.baq.20110328.vcf, freebayes.20:54400000-54500000.baq.20110328.vcf, freebayes.20:54500000-54600000.baq.20110328.vcf, freebayes.20:54600000-54700000.baq.20110328.vcf, freebayes.20:54700000-54800000.baq.20110328.vcf, freebayes.20:54800000-54900000.baq.20110328.vcf, freebayes.20:54900000-55000000.baq.20110328.vcf, freebayes.20:55000000-55100000.baq.20110328.vcf, 
freebayes.20:55100000-55200000.baq.20110328.vcf, freebayes.20:55200000-55300000.baq.20110328.vcf, freebayes.20:55300000-55400000.baq.20110328.vcf, freebayes.20:55400000-55500000.baq.20110328.vcf, freebayes.20:55500000-55600000.baq.20110328.vcf, freebayes.20:55600000-55700000.baq.20110328.vcf, freebayes.20:55700000-55800000.baq.20110328.vcf, freebayes.20:55800000-55900000.baq.20110328.vcf, freebayes.20:55900000-56000000.baq.20110328.vcf, freebayes.20:56000000-56100000.baq.20110328.vcf, freebayes.20:56100000-56200000.baq.20110328.vcf, freebayes.20:56200000-56300000.baq.20110328.vcf, freebayes.20:56300000-56400000.baq.20110328.vcf, freebayes.20:56400000-56500000.baq.20110328.vcf, freebayes.20:56500000-56600000.baq.20110328.vcf, freebayes.20:56600000-56700000.baq.20110328.vcf, freebayes.20:56700000-56800000.baq.20110328.vcf, freebayes.20:56800000-56900000.baq.20110328.vcf, freebayes.20:56900000-57000000.baq.20110328.vcf, freebayes.20:57000000-57100000.baq.20110328.vcf, freebayes.20:57100000-57200000.baq.20110328.vcf, freebayes.20:57200000-57300000.baq.20110328.vcf, freebayes.20:57300000-57400000.baq.20110328.vcf, freebayes.20:57400000-57500000.baq.20110328.vcf, freebayes.20:57500000-57600000.baq.20110328.vcf, freebayes.20:57600000-57700000.baq.20110328.vcf, freebayes.20:57700000-57800000.baq.20110328.vcf, freebayes.20:57800000-57900000.baq.20110328.vcf, freebayes.20:57900000-58000000.baq.20110328.vcf, freebayes.20:58000000-58100000.baq.20110328.vcf, freebayes.20:58100000-58200000.baq.20110328.vcf, freebayes.20:58200000-58300000.baq.20110328.vcf, freebayes.20:58300000-58400000.baq.20110328.vcf, freebayes.20:58400000-58500000.baq.20110328.vcf, freebayes.20:58500000-58600000.baq.20110328.vcf, freebayes.20:58600000-58700000.baq.20110328.vcf, freebayes.20:58700000-58800000.baq.20110328.vcf, freebayes.20:58800000-58900000.baq.20110328.vcf, freebayes.20:58900000-59000000.baq.20110328.vcf, freebayes.20:59000000-59100000.baq.20110328.vcf, 
freebayes.20:59100000-59200000.baq.20110328.vcf, freebayes.20:59200000-59300000.baq.20110328.vcf, freebayes.20:59300000-59400000.baq.20110328.vcf, freebayes.20:59400000-59500000.baq.20110328.vcf, freebayes.20:59500000-59600000.baq.20110328.vcf, freebayes.20:59600000-59700000.baq.20110328.vcf, freebayes.20:59700000-59800000.baq.20110328.vcf, freebayes.20:59800000-59900000.baq.20110328.vcf, freebayes.20:59900000-60000000.baq.20110328.vcf, freebayes.20:60000000-60100000.baq.20110328.vcf, freebayes.20:60100000-60200000.baq.20110328.vcf, freebayes.20:60200000-60300000.baq.20110328.vcf, freebayes.20:60300000-60400000.baq.20110328.vcf, freebayes.20:60400000-60500000.baq.20110328.vcf, freebayes.20:60500000-60600000.baq.20110328.vcf, freebayes.20:60600000-60700000.baq.20110328.vcf, freebayes.20:60700000-60800000.baq.20110328.vcf, freebayes.20:60800000-60900000.baq.20110328.vcf, freebayes.20:60900000-61000000.baq.20110328.vcf, freebayes.20:61000000-61100000.baq.20110328.vcf, freebayes.20:61100000-61200000.baq.20110328.vcf, freebayes.20:61200000-61300000.baq.20110328.vcf, freebayes.20:61300000-61400000.baq.20110328.vcf, freebayes.20:61400000-61500000.baq.20110328.vcf, freebayes.20:61500000-61600000.baq.20110328.vcf, freebayes.20:61600000-61700000.baq.20110328.vcf, freebayes.20:61700000-61800000.baq.20110328.vcf, freebayes.20:61800000-61900000.baq.20110328.vcf, freebayes.20:61900000-62000000.baq.20110328.vcf, freebayes.20:62000000-62100000.baq.20110328.vcf, freebayes.20:62100000-62200000.baq.20110328.vcf, freebayes.20:62200000-62300000.baq.20110328.vcf, freebayes.20:62300000-62400000.baq.20110328.vcf, freebayes.20:62400000-62500000.baq.20110328.vcf, freebayes.20:62500000-62600000.baq.20110328.vcf, freebayes.20:62600000-62700000.baq.20110328.vcf, freebayes.20:62700000-62800000.baq.20110328.vcf, freebayes.20:62800000-62900000.baq.20110328.vcf, freebayes.20:62900000-63000000.baq.20110328.vcf, freebayes.20:63000000-63025520.baq.20110328.vcf -#CHROM POS ID REF ALT QUAL FILTER INFO 
+#CHROM POS ID REF ALT QUAL FILTER INFO 20 458502 . G GA 4567.01 PASS AA=20;AB=0.61111;ABA=14;ABP=6.8707;ABR=22;AC=38;AF=0.0544;AN=698;BL=374;BR=1129;BVAR;BaseQRankSum=13.364;DP=15979;DP4=1882,2188,45,37;Dels=0.00;EL=5;EPP=13.868;ER=15;FR;FS=6.503;HETAR=11;HOMA=2;HOMR=985;HP=1;HPLen=2;HR=2;HRun=0;HU=G;INDEL;INS;InbreedingCoeff=0.0157;IndelType=INS.NOVEL_1.Novel_A.;LEN=1;LRB=0.50233;LRBP=826.56;MQ=66.16;MQ0Fraction=0.0110;MQM=70.5;MQRankSum=-3.158;NF;NR;NS=998;PP;PV4=0.15,1,0.42,0.15;RA=3173;RL=1;RPP=38.188;RR=19;RUN=1;ReadPosRankSum=-2.346;SAB=0.7;SAF=14;SAP=9.959;SAR=6;SC=GGGCGTGGTGGTGCATGTAAT;SRB=0.50047;SRF=1588;SRP=3.0165;SRR=1585;TC;TR=9;TU=GGT;VQSLOD=10.0079;set=Intersection;sumGLbyD=23.94 20 573764 . TA T 591.51 PASS AC=91;AF=0.1987;AN=458;BaseQRankSum=0.137;DP=519;FS=3.153;HRun=1;HaplotypeScore=14.0744;InbreedingCoeff=0.1460;IndelType=DEL.NumRepetitions_1.EventLength_1.RepeatExpansion_A.;MQ=48.16;MQ0=26;MQ0Fraction=0.0501;MQRankSum=-1.636;QD=3.63;ReadPosRankSum=-4.140;SB=-408.14;VQSLOD=5.2458;set=VQSR 20 766143 . 
C CATCTGGTA 5521.70 PASS AA=24;AB=0.5;ABA=18;ABP=3.0103;ABR=18;AC=14;AF=0.0289;AF1=0.02038;AN=484;BL=655;BR=1542;BVAR;BaseQRankSum=3.801;CI95=0.01549,0.02655;DP=11749;DP4=2222,1998,14,8;Dels=0.00;EL=9;EPP=6.2675;ER=15;FQ=999;FR;FS=2.941;HETAR=9;HOMA=4;HOMR=901;HP=2;HPLen=2;HR=1;HRun=0;HU=A;INDEL;INS;InbreedingCoeff=0.0515;IndelType=INS.NumRepetitions_1.EventLength_8.;LEN=8;LRB=0.40373;LRBP=780.64;MQ=56.81;MQ0Fraction=0.0253;MQM=22.167;MQRankSum=-4.809;NF;NR;NS=914;PP;PV4=0.39,1,5.8e-07,1;RA=3093;RL=6;RPP=16.039;RR=18;RUN=1;ReadPosRankSum=-2.827;SAB=0.625;SAF=15;SAP=6.2675;SAR=9;SC=GCTTTAAATTCATCTGGTACT;SRB=0.61623;SRF=1906;SRP=365.95;SRR=1187;TC;TR=1;TU=A;VQSLOD=7.0268;set=Intersection;sumGLbyD=50.23 diff --git a/tests/tabix_data/vcf/20.vcf b/tests/tabix_data/vcf/20.vcf index e14beb5..5e33de1 100644 --- a/tests/tabix_data/vcf/20.vcf +++ b/tests/tabix_data/vcf/20.vcf @@ -4,7 +4,7 @@ ##INFO= ##INFO= ##reference=file:///humgen/1kg/reference/human_g1k_v37.fasta -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO 20 207414 . G A . . PacBio=NoCall;Sqnm=NoCall 20 792106 . C G . . PacBio=Poly;Sqnm=NoCall 20 894031 . G A . . PacBio=Poly;Sqnm=Poly diff --git a/tests/tabix_data/vcf/23.vcf b/tests/tabix_data/vcf/23.vcf index 9c9ed12..153bad6 100644 --- a/tests/tabix_data/vcf/23.vcf +++ b/tests/tabix_data/vcf/23.vcf @@ -216,7 +216,7 @@ ##source=SelectVariants ##source_20110031.1=/nfs/users/nfs_p/pd3/cvs/vcftools/perl/vcf-annotate -d /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.desc -a /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.tab.gz -c CHROM,FROM,INFO/VLD,INFO/KGPilot123,INFO/dbSNP ##vcfCTools=filter -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO 20 458502 . 
G GA 4567.01 PASS AA=20;AB=0.61111;ABA=14;ABP=6.8707;ABR=22;AC=38;AF=0.0544;AN=698;BL=374;BR=1129;BVAR;BaseQRankSum=13.364;DP=15979;DP4=1882,2188,45,37;Dels=0.00;EL=5;EPP=13.868;ER=15;FR;FS=6.503;HETAR=11;HOMA=2;HOMR=985;HP=1;HPLen=2;HR=2;HRun=0;HU=G;INDEL;INS;InbreedingCoeff=0.0157;IndelType=INS.NOVEL_1.Novel_A.;LEN=1;LRB=0.50233;LRBP=826.56;MQ=66.16;MQ0Fraction=0.0110;MQM=70.5;MQRankSum=-3.158;NF;NR;NS=998;PP;PV4=0.15,1,0.42,0.15;RA=3173;RL=1;RPP=38.188;RR=19;RUN=1;ReadPosRankSum=-2.346;SAB=0.7;SAF=14;SAP=9.959;SAR=6;SC=GGGCGTGGTGGTGCATGTAAT;SET_INTEGRATION;SET_WGVQSR;SRB=0.50047;SRF=1588;SRP=3.0165;SRR=1585;TC;TR=9;TU=GGT;VQSLOD=10.0079;set=Intersection;sumGLbyD=23.94 20 573764 . TA T 591.51 PASS AC=91;AF=0.1987;AN=458;BaseQRankSum=0.137;DP=519;FS=3.153;HRun=1;HaplotypeScore=14.0744;InbreedingCoeff=0.1460;IndelType=DEL.NumRepetitions_1.EventLength_1.RepeatExpansion_A.;MQ=48.16;MQ0=26;MQ0Fraction=0.0501;MQRankSum=-1.636;QD=3.63;ReadPosRankSum=-4.140;SB=-408.14;SET_INTEGRATION;SET_WGVQSR;VQSLOD=5.2458;set=VQSR 20 766143 . 
C CATCTGGTA 5521.70 PASS AA=24;AB=0.5;ABA=18;ABP=3.0103;ABR=18;AC=14;AF=0.0289;AF1=0.02038;AN=484;BL=655;BR=1542;BVAR;BaseQRankSum=3.801;CI95=0.01549,0.02655;DP=11749;DP4=2222,1998,14,8;Dels=0.00;EL=9;EPP=6.2675;ER=15;FQ=999;FR;FS=2.941;HETAR=9;HOMA=4;HOMR=901;HP=2;HPLen=2;HR=1;HRun=0;HU=A;INDEL;INS;InbreedingCoeff=0.0515;IndelType=INS.NumRepetitions_1.EventLength_8.;LEN=8;LRB=0.40373;LRBP=780.64;MQ=56.81;MQ0Fraction=0.0253;MQM=22.167;MQRankSum=-4.809;NF;NR;NS=914;PP;PV4=0.39,1,5.8e-07,1;RA=3093;RL=6;RPP=16.039;RR=18;RUN=1;ReadPosRankSum=-2.827;SAB=0.625;SAF=15;SAP=6.2675;SAR=9;SC=GCTTTAAATTCATCTGGTACT;SET_INTEGRATION;SET_WGVQSR;SRB=0.61623;SRF=1906;SRP=365.95;SRR=1187;TC;TR=1;TU=A;VQSLOD=7.0268;set=Intersection;sumGLbyD=50.23 diff --git a/tests/tabix_test.py b/tests/tabix_test.py index 7546175..3f1f716 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -930,11 +930,11 @@ class TestVCFFromVariantFile(TestVCFFromVCF): "filter", "info", "format") fail_on_parsing = [ - (24, "Could not parse the header, sample line not found"), + (24, 'Could not parse the "#CHROM.." line'), ("issue85", "empty VCF"), ] fail_on_opening = [ - (24, "Could not parse the header, sample line not found"), + (24, 'Could not parse the "#CHROM.." 
line'), ("issue85", "empty VCF"), ] coordinate_offset = 0 diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py index 1806909..762002a 100644 --- a/tests/tabixproxies_test.py +++ b/tests/tabixproxies_test.py @@ -5,13 +5,39 @@ import sys import re import copy import gzip -from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR +from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR, IS_PYTHON3 def setUpModule(): make_data_files(TABIX_DATADIR) +@unittest.skipUnless(IS_PYTHON3, "Requires Python 3 Extended Iterable Unpacking") +class TestBED(unittest.TestCase): + + filename = os.path.join(TABIX_DATADIR, "fivecolumns.bed.gz") + + def setUp(self): + self.tabix = pysam.TabixFile(self.filename) + + def tearDown(self): + self.tabix.close() + + def testAssignmentToTargetList(self): + # TODO When we drop Python 2, remove exec() & my and simplify these + my = {} + for row in self.tabix.fetch(parser=pysam.asTuple()): + my['row'] = row + + # Test that *others gets the right columns... + exec('contig, start, end, *others = row', globals(), my) + self.assertEqual(3 + len(my['others']), len(row)) + + # ...and that a TupleProxy can be assigned from more than once + exec('contig, *others = row', globals(), my) + self.assertEqual(1 + len(my['others']), len(row)) + + class TestParser(unittest.TestCase): filename = os.path.join(TABIX_DATADIR, "example.gtf.gz") @@ -43,6 +69,13 @@ class TestParser(unittest.TestCase): self.assertEqual("\t".join(map(str, c)), str(r)) + @unittest.skipUnless(IS_PYTHON3, "Requires Python 3 Extended Iterable Unpacking") + def testAssignmentToTargetList(self): + for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): + my = { 'r': r } + exec('col1, col2, *others, colN = r', globals(), my) + self.assertEqual(2 + len(my['others']) + 1, len(r)) + def testWrite(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): -- 2.30.2